fix_mmdetection

eb1107e4 · raojy · 7aa442d5 · eb1107e4 · eb1107e4 · eb1107e4
Commit eb1107e4 authored Apr 01, 2026 by raojy
20 changed files
--- a/mmde/configs/free_anchor/pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
+++ b/mmde/configs/free_anchor/pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
+_base_ = [
+    '../_base_/models/pointpillars_hv_fpn_nus.py',
+    '../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule-2x.py',
+    '../_base_/default_runtime.py'
+]
+model = dict(
+    pts_bbox_head=dict(
+        _delete_=True,
+        type='FreeAnchor3DHead',
+        num_classes=10,
+        in_channels=256,
+        feat_channels=256,
+        use_direction_classifier=True,
+        pre_anchor_topk=25,
+        bbox_thr=0.5,
+        gamma=2.0,
+        alpha=0.5,
+        anchor_generator=dict(
+            type='AlignedAnchor3DRangeGenerator',
+            ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
+            scales=[1, 2, 4],
+            sizes=[
+                [2.5981, 0.8660, 1.],  # 1.5 / sqrt(3)
+                [1.7321, 0.5774, 1.],  # 1 / sqrt(3)
+                [1., 1., 1.],
+                [0.4, 0.4, 1],
+            ],
+            custom_values=[0, 0],
+            rotations=[0, 1.57],
+            reshape_out=True),
+        assigner_per_size=False,
+        diff_rad_by_sin=True,
+        dir_offset=-0.7854,  # -pi / 4
+        bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
+        loss_cls=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(
+            type='mmdet.SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.8),
+        loss_dir=dict(
+            type='mmdet.CrossEntropyLoss', use_sigmoid=False,
+            loss_weight=0.2)),
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.25, 0.25])))
--- a/mmde/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
+++ b/mmde/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
+_base_ = './pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py'
+model = dict(
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch='regnetx_1.6gf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[168, 408, 912]))
--- a/mmde/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py
+++ b/mmde/configs/free_anchor/pointpillars_hv_regnet-1.6gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py
+_base_ = './pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py'
+model = dict(
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch='regnetx_1.6gf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_1.6gf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[168, 408, 912]))
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.7854, 0.7854],
+        scale_ratio_range=[0.95, 1.05],
+        translation_std=[0.2, 0.2, 0.2]),
+    dict(
+        type='RandomFlip3D',
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+train_cfg = dict(max_epochs=36, val_interval=36)
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 1000,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=24,
+        by_epoch=True,
+        milestones=[28, 34],
+        gamma=0.1)
+]
--- a/mmde/configs/free_anchor/pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
+++ b/mmde/configs/free_anchor/pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
+_base_ = './pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py'
+model = dict(
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch='regnetx_3.2gf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[192, 432, 1008]))
--- a/mmde/configs/free_anchor/pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py
+++ b/mmde/configs/free_anchor/pointpillars_hv_regnet-3.2gf_fpn_head-free-anchor_sbn-all_8xb4-strong-aug-3x_nus-3d.py
+_base_ = './pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py'
+model = dict(
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch='regnetx_3.2gf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_3.2gf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[192, 432, 1008]))
+# If point cloud range is changed, the models should also change their point
+# cloud range accordingly
+point_cloud_range = [-50, -50, -5, 50, 50, 3]
+# For nuScenes we usually do 10-class detection
+class_names = [
+    'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
+    'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
+]
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='LIDAR',
+        load_dim=5,
+        use_dim=5,
+        backend_args=backend_args),
+    dict(
+        type='LoadPointsFromMultiSweeps',
+        sweeps_num=10,
+        backend_args=backend_args),
+    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.7854, 0.7854],
+        scale_ratio_range=[0.9, 1.1],
+        translation_std=[0.2, 0.2, 0.2]),
+    dict(
+        type='RandomFlip3D',
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
+    dict(type='ObjectNameFilter', classes=class_names),
+    dict(type='PointShuffle'),
+    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
+]
+train_dataloader = dict(dataset=dict(pipeline=train_pipeline))
+train_cfg = dict(max_epochs=36, val_interval=36)
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR',
+        start_factor=1.0 / 1000,
+        by_epoch=False,
+        begin=0,
+        end=1000),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=36,
+        by_epoch=True,
+        milestones=[28, 34],
+        gamma=0.1)
+]
--- a/mmde/configs/free_anchor/pointpillars_hv_regnet-400mf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
+++ b/mmde/configs/free_anchor/pointpillars_hv_regnet-400mf_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py
+_base_ = './pointpillars_hv_fpn_head-free-anchor_sbn-all_8xb4-2x_nus-3d.py'
+model = dict(
+    pts_backbone=dict(
+        _delete_=True,
+        type='NoStemRegNet',
+        arch='regnetx_400mf',
+        init_cfg=dict(
+            type='Pretrained', checkpoint='open-mmlab://regnetx_400mf'),
+        out_indices=(1, 2, 3),
+        frozen_stages=-1,
+        strides=(1, 2, 2, 2),
+        base_channels=64,
+        stem_channels=64,
+        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
+        norm_eval=False,
+        style='pytorch'),
+    pts_neck=dict(in_channels=[64, 160, 384]))
--- a/mmde/configs/groupfree3d/README.md
+++ b/mmde/configs/groupfree3d/README.md
+# Group-Free 3D Object Detection via Transformers
+> [Group-Free 3D Object Detection via Transformers](https://arxiv.org/abs/2104.00678)
+<!-- [ALGORITHM] -->
+## Abstract
+Recently, directly detecting 3D objects from 3D point clouds has received increasing attention. To extract object representation from an irregular point cloud, existing methods usually take a point grouping step to assign the points to an object candidate so that a PointNet-like network could be used to derive object features from the grouped points. However, the inaccurate point assignments caused by the hand-crafted grouping scheme decrease the performance of 3D object detection. In this paper, we present a simple yet effective method for directly detecting 3D objects from the 3D point cloud. Instead of grouping local points to each object candidate, our method computes the feature of an object from all the points in the point cloud with the help of an attention mechanism in the Transformers, where the contribution of each point is automatically learned in the network training. With an improved attention stacking scheme, our method fuses object features in different stages and generates more accurate object detection results. With few bells and whistles, the proposed method achieves state-of-the-art 3D object detection performance on two widely used benchmarks, ScanNet V2 and SUN RGB-D.
+<div align=center>
+<img src="https://user-images.githubusercontent.com/36950400/143868101-09787c2a-9e0b-4013-8800-b4e315d535f0.png" width="800"/>
+</div>
+## Introduction
+We implement Group-Free-3D and provide the result and checkpoints on ScanNet datasets.
+## Results and models
+### ScanNet
+|                              Method                              |   Backbone    | Lr schd | Mem (GB) | Inf time (fps) |     AP@0.25     |     AP@0.5      |                                                                                                                                                                                                        Download                                                                                                                                                                                                        |
+| :--------------------------------------------------------------: | :-----------: | :-----: | :------: | :------------: | :-------------: | :-------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+|    [L6, O256](./groupfree3d_head-L6-O256_4xb8_scannet-seg.py)    |  PointNet++   |   3x    |   6.7    |                | 66.17 (65.67\*) | 48.47 (47.74\*) |           [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L6-O256/groupfree3d_8x4_scannet-3d-18class-L6-O256_20210702_145347-3499eb55.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L6-O256/groupfree3d_8x4_scannet-3d-18class-L6-O256_20210702_145347.log.json)           |
+|   [L12, O256](./groupfree3d_head-L12-O256_4xb8_scannet-seg.py)   |  PointNet++   |   3x    |   9.4    |                | 66.57 (66.22\*) | 48.21 (48.95\*) |         [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L12-O256/groupfree3d_8x4_scannet-3d-18class-L12-O256_20210702_150907-1c5551ad.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L12-O256/groupfree3d_8x4_scannet-3d-18class-L12-O256_20210702_150907.log.json)         |
+| [L12, O256](./groupfree3d_w2x-head-L12-O256_4xb8_scannet-seg.py) | PointNet++w2x |   3x    |   13.3   |                | 68.20 (67.30\*) | 51.02 (50.44\*) | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256_20210702_200301-944f0ac0.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256_20210702_200301.log.json) |
+| [L12, O512](./groupfree3d_w2x-head-L12-O512_4xb8_scannet-seg.py) | PointNet++w2x |   3x    |   18.8   |                | 68.22 (68.20\*) | 52.61 (51.31\*) | [model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512_20210702_220204-187b71c7.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512_20210702_220204.log.json) |
+**Notes:**
+- We defined L6-O256 represent num_layers=6 and num_proposals=256. And w2x means that the model backbone weight is twice the original.
+- We report the best results (AP@0.50) on validation set during each training. * means the evaluation method in the paper: we train each setting 5 times and test each training trial 5 times, then the average performance of these 25 trials is reported to account for algorithm randomness.
+- We use 4 GPUs for training by default as the original code.
+## Citation
+```latex
+@article{liu2021,
+  title={Group-Free 3D Object Detection via Transformers},
+  author={Liu, Ze and Zhang, Zheng and Cao, Yue and Hu, Han and Tong, Xin},
+  journal={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
+  year={2021}
+}
+```
--- a/mmde/configs/groupfree3d/groupfree3d_head-L12-O256_4xb8_scannet-seg.py
+++ b/mmde/configs/groupfree3d/groupfree3d_head-L12-O256_4xb8_scannet-seg.py
+_base_ = [
+    '../_base_/datasets/scannet-3d.py', '../_base_/models/groupfree3d.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    bbox_head=dict(
+        num_classes=18,
+        num_decoder_layers=12,
+        size_cls_agnostic=False,
+        bbox_coder=dict(
+            type='GroupFree3DBBoxCoder',
+            num_sizes=18,
+            num_dir_bins=1,
+            with_rot=False,
+            size_cls_agnostic=False,
+            mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                        [1.876858, 1.8425595, 1.1931566],
+                        [0.61328, 0.6148609, 0.7182701],
+                        [1.3955007, 1.5121545, 0.83443564],
+                        [0.97949594, 1.0675149, 0.6329687],
+                        [0.531663, 0.5955577, 1.7500148],
+                        [0.9624706, 0.72462326, 1.1481868],
+                        [0.83221924, 1.0490936, 1.6875663],
+                        [0.21132214, 0.4206159, 0.5372846],
+                        [1.4440073, 1.8970833, 0.26985747],
+                        [1.0294262, 1.4040797, 0.87554324],
+                        [1.3766412, 0.65521795, 1.6813129],
+                        [0.6650819, 0.71111923, 1.298853],
+                        [0.41999173, 0.37906948, 1.7513971],
+                        [0.59359556, 0.5912492, 0.73919016],
+                        [0.50867593, 0.50656086, 0.30136237],
+                        [1.1511526, 1.0546296, 0.49706793],
+                        [0.47535285, 0.49249494, 0.5802117]]),
+        sampling_objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=8.0),
+        objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        center_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=0.04,
+            reduction='sum',
+            loss_weight=10.0),
+        dir_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=1.0 / 9.0,
+            reduction='sum',
+            loss_weight=10.0 / 9.0),
+        semantic_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    test_cfg=dict(
+        sample_mode='kps',
+        nms_thr=0.25,
+        score_thr=0.0,
+        per_class_proposal=True,
+        prediction_stages='last_three'))
+# dataset settings
+dataset_type = 'ScanNetDataset'
+data_root = './data/scannet/'
+class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+               'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+               'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+               'garbagebin')
+metainfo = dict(classes=class_names)
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=50000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0]),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=50000),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='scannet_infos_train.pkl',
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth',
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type='IndoorMetric')
+test_evaluator = val_evaluator
+# optimizer
+lr = 0.006
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_self_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_cross_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
+        }))
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=80,
+        by_epoch=True,
+        milestones=[56, 68],
+        gamma=0.1)
+]
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=80, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=10))
--- a/mmde/configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py
+++ b/mmde/configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py
+_base_ = [
+    '../_base_/datasets/scannet-3d.py', '../_base_/models/groupfree3d.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    bbox_head=dict(
+        num_classes=18,
+        size_cls_agnostic=False,
+        bbox_coder=dict(
+            type='GroupFree3DBBoxCoder',
+            num_sizes=18,
+            num_dir_bins=1,
+            with_rot=False,
+            size_cls_agnostic=False,
+            mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                        [1.876858, 1.8425595, 1.1931566],
+                        [0.61328, 0.6148609, 0.7182701],
+                        [1.3955007, 1.5121545, 0.83443564],
+                        [0.97949594, 1.0675149, 0.6329687],
+                        [0.531663, 0.5955577, 1.7500148],
+                        [0.9624706, 0.72462326, 1.1481868],
+                        [0.83221924, 1.0490936, 1.6875663],
+                        [0.21132214, 0.4206159, 0.5372846],
+                        [1.4440073, 1.8970833, 0.26985747],
+                        [1.0294262, 1.4040797, 0.87554324],
+                        [1.3766412, 0.65521795, 1.6813129],
+                        [0.6650819, 0.71111923, 1.298853],
+                        [0.41999173, 0.37906948, 1.7513971],
+                        [0.59359556, 0.5912492, 0.73919016],
+                        [0.50867593, 0.50656086, 0.30136237],
+                        [1.1511526, 1.0546296, 0.49706793],
+                        [0.47535285, 0.49249494, 0.5802117]]),
+        sampling_objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=8.0),
+        objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        center_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=0.04,
+            reduction='sum',
+            loss_weight=10.0),
+        dir_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=1.0 / 9.0,
+            reduction='sum',
+            loss_weight=10.0 / 9.0),
+        semantic_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    test_cfg=dict(
+        sample_mode='kps',
+        nms_thr=0.25,
+        score_thr=0.0,
+        per_class_proposal=True,
+        prediction_stages='last_three'))
+# dataset settings
+dataset_type = 'ScanNetDataset'
+data_root = './data/scannet/'
+class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+               'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+               'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+               'garbagebin')
+metainfo = dict(classes=class_names)
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=50000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0]),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=50000),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='scannet_infos_train.pkl',
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth',
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type='IndoorMetric')
+test_evaluator = val_evaluator
+# optimizer
+lr = 0.006
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_self_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_cross_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
+        }))
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=80,
+        by_epoch=True,
+        milestones=[56, 68],
+        gamma=0.1)
+]
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=80, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=10))
+randomness = dict(seed=4)
--- a/mmde/configs/groupfree3d/groupfree3d_w2x-head-L12-O256_4xb8_scannet-seg.py
+++ b/mmde/configs/groupfree3d/groupfree3d_w2x-head-L12-O256_4xb8_scannet-seg.py
+_base_ = [
+    '../_base_/datasets/scannet-3d.py', '../_base_/models/groupfree3d.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=3,
+        num_points=(2048, 1024, 512, 256),
+        radius=(0.2, 0.4, 0.8, 1.2),
+        num_samples=(64, 32, 16, 16),
+        sa_channels=((128, 128, 256), (256, 256, 512), (256, 256, 512),
+                     (256, 256, 512)),
+        fp_channels=((512, 512), (512, 288)),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)),
+    bbox_head=dict(
+        num_classes=18,
+        num_decoder_layers=12,
+        size_cls_agnostic=False,
+        bbox_coder=dict(
+            type='GroupFree3DBBoxCoder',
+            num_sizes=18,
+            num_dir_bins=1,
+            with_rot=False,
+            size_cls_agnostic=False,
+            mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                        [1.876858, 1.8425595, 1.1931566],
+                        [0.61328, 0.6148609, 0.7182701],
+                        [1.3955007, 1.5121545, 0.83443564],
+                        [0.97949594, 1.0675149, 0.6329687],
+                        [0.531663, 0.5955577, 1.7500148],
+                        [0.9624706, 0.72462326, 1.1481868],
+                        [0.83221924, 1.0490936, 1.6875663],
+                        [0.21132214, 0.4206159, 0.5372846],
+                        [1.4440073, 1.8970833, 0.26985747],
+                        [1.0294262, 1.4040797, 0.87554324],
+                        [1.3766412, 0.65521795, 1.6813129],
+                        [0.6650819, 0.71111923, 1.298853],
+                        [0.41999173, 0.37906948, 1.7513971],
+                        [0.59359556, 0.5912492, 0.73919016],
+                        [0.50867593, 0.50656086, 0.30136237],
+                        [1.1511526, 1.0546296, 0.49706793],
+                        [0.47535285, 0.49249494, 0.5802117]]),
+        sampling_objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=8.0),
+        objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        center_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=0.04,
+            reduction='sum',
+            loss_weight=10.0),
+        dir_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=1.0 / 9.0,
+            reduction='sum',
+            loss_weight=10.0 / 9.0),
+        semantic_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    test_cfg=dict(
+        sample_mode='kps',
+        nms_thr=0.25,
+        score_thr=0.0,
+        per_class_proposal=True,
+        prediction_stages='last_three'))
+# dataset settings
+dataset_type = 'ScanNetDataset'
+data_root = './data/scannet/'
+class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+               'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+               'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+               'garbagebin')
+metainfo = dict(classes=class_names)
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=50000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0]),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=50000),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='scannet_infos_train.pkl',
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth',
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type='IndoorMetric')
+test_evaluator = val_evaluator
+# optimizer
+lr = 0.006
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_self_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_cross_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
+        }))
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=80,
+        by_epoch=True,
+        milestones=[56, 68],
+        gamma=0.1)
+]
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=80, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=10))
--- a/mmde/configs/groupfree3d/groupfree3d_w2x-head-L12-O512_4xb8_scannet-seg.py
+++ b/mmde/configs/groupfree3d/groupfree3d_w2x-head-L12-O512_4xb8_scannet-seg.py
+_base_ = [
+    '../_base_/datasets/scannet-3d.py', '../_base_/models/groupfree3d.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=3,
+        num_points=(2048, 1024, 512, 256),
+        radius=(0.2, 0.4, 0.8, 1.2),
+        num_samples=(64, 32, 16, 16),
+        sa_channels=((128, 128, 256), (256, 256, 512), (256, 256, 512),
+                     (256, 256, 512)),
+        fp_channels=((512, 512), (512, 288)),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)),
+    bbox_head=dict(
+        num_classes=18,
+        num_decoder_layers=12,
+        num_proposal=512,
+        size_cls_agnostic=False,
+        bbox_coder=dict(
+            type='GroupFree3DBBoxCoder',
+            num_sizes=18,
+            num_dir_bins=1,
+            with_rot=False,
+            size_cls_agnostic=False,
+            mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                        [1.876858, 1.8425595, 1.1931566],
+                        [0.61328, 0.6148609, 0.7182701],
+                        [1.3955007, 1.5121545, 0.83443564],
+                        [0.97949594, 1.0675149, 0.6329687],
+                        [0.531663, 0.5955577, 1.7500148],
+                        [0.9624706, 0.72462326, 1.1481868],
+                        [0.83221924, 1.0490936, 1.6875663],
+                        [0.21132214, 0.4206159, 0.5372846],
+                        [1.4440073, 1.8970833, 0.26985747],
+                        [1.0294262, 1.4040797, 0.87554324],
+                        [1.3766412, 0.65521795, 1.6813129],
+                        [0.6650819, 0.71111923, 1.298853],
+                        [0.41999173, 0.37906948, 1.7513971],
+                        [0.59359556, 0.5912492, 0.73919016],
+                        [0.50867593, 0.50656086, 0.30136237],
+                        [1.1511526, 1.0546296, 0.49706793],
+                        [0.47535285, 0.49249494, 0.5802117]]),
+        sampling_objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=8.0),
+        objectness_loss=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        center_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=0.04,
+            reduction='sum',
+            loss_weight=10.0),
+        dir_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        dir_res_loss=dict(
+            type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+        size_class_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0),
+        size_res_loss=dict(
+            type='mmdet.SmoothL1Loss',
+            beta=1.0 / 9.0,
+            reduction='sum',
+            loss_weight=10.0 / 9.0),
+        semantic_loss=dict(
+            type='mmdet.CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
+    test_cfg=dict(
+        sample_mode='kps',
+        nms_thr=0.25,
+        score_thr=0.0,
+        per_class_proposal=True,
+        prediction_stages='last_three'))
+# dataset settings
+dataset_type = 'ScanNetDataset'
+data_root = './data/scannet/'
+class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
+               'bookshelf', 'picture', 'counter', 'desk', 'curtain',
+               'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
+               'garbagebin')
+metainfo = dict(classes=class_names)
+backend_args = None
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox_3d=True,
+        with_label_3d=True,
+        with_mask_3d=True,
+        with_seg_3d=True,
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(type='PointSegClassMapping'),
+    dict(type='PointSample', num_points=50000),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.087266, 0.087266],
+        scale_ratio_range=[1.0, 1.0]),
+    dict(
+        type='Pack3DDetInputs',
+        keys=[
+            'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
+            'pts_instance_mask'
+        ])
+]
+test_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='GlobalAlignment', rotation_axis=2),
+    dict(
+        type='MultiScaleFlipAug3D',
+        img_scale=(1333, 800),
+        pts_scale_ratio=1,
+        flip=False,
+        transforms=[
+            dict(
+                type='GlobalRotScaleTrans',
+                rot_range=[0, 0],
+                scale_ratio_range=[1., 1.],
+                translation_std=[0, 0, 0]),
+            dict(
+                type='RandomFlip3D',
+                sync_2d=False,
+                flip_ratio_bev_horizontal=0.5,
+                flip_ratio_bev_vertical=0.5),
+            dict(type='PointSample', num_points=50000),
+        ]),
+    dict(type='Pack3DDetInputs', keys=['points'])
+]
+train_dataloader = dict(
+    batch_size=8,
+    num_workers=4,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset',
+        times=5,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            ann_file='scannet_infos_train.pkl',
+            pipeline=train_pipeline,
+            filter_empty_gt=False,
+            metainfo=metainfo,
+            # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+            # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+            box_type_3d='Depth',
+            backend_args=backend_args)))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=1,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='scannet_infos_val.pkl',
+        pipeline=test_pipeline,
+        metainfo=metainfo,
+        test_mode=True,
+        box_type_3d='Depth',
+        backend_args=backend_args))
+val_evaluator = dict(type='IndoorMetric')
+test_evaluator = val_evaluator
+# optimizer
+lr = 0.006
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.0005),
+    clip_grad=dict(max_norm=0.1, norm_type=2),
+    paramwise_cfg=dict(
+        custom_keys={
+            'bbox_head.decoder_layers': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_self_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_cross_posembeds': dict(
+                lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_query_proj': dict(lr_mult=0.1, decay_mult=1.0),
+            'bbox_head.decoder_key_proj': dict(lr_mult=0.1, decay_mult=1.0)
+        }))
+# learning rate
+param_scheduler = [
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=80,
+        by_epoch=True,
+        milestones=[56, 68],
+        gamma=0.1)
+]
+# training schedule for 1x
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=80, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=10))
--- a/mmde/configs/groupfree3d/metafile.yml
+++ b/mmde/configs/groupfree3d/metafile.yml
+Collections:
+  - Name: Group-Free-3D
+    Metadata:
+      Training Techniques:
+        - AdamW
+      Training Resources: 4x V100 GPUs
+      Architecture:
+        - PointNet++
+    Paper:
+      URL: https://arxiv.org/abs/2104.00678
+      Title: 'Group-Free 3D Object Detection via Transformers'
+    README: configs/groupfree3d/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/groupfree3dnet.py#L10
+      Version: v0.15.0
+Models:
+  - Name: groupfree3d_head-L6-O256_4xb8_scannet-seg.py
+    In Collection: Group-Free-3D
+    Config: configs/groupfree3d/groupfree3d_head-L6-O256_4xb8_scannet-seg.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 6.7
+    Results:
+      - Task: 3D Object Detection
+        Dataset: ScanNet
+        Metrics:
+          AP@0.25: 66.17
+          AP@0.5: 48.47
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L6-O256/groupfree3d_8x4_scannet-3d-18class-L6-O256_20210702_145347-3499eb55.pth
+  - Name: groupfree3d_head-L12-O256_4xb8_scannet-seg.py
+    In Collection: Group-Free-3D
+    Config: configs/groupfree3d/groupfree3d_head-L12-O256_4xb8_scannet-seg.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 9.4
+    Results:
+      - Task: 3D Object Detection
+        Dataset: ScanNet
+        Metrics:
+          AP@0.25: 66.57
+          AP@0.5: 48.21
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-L12-O256/groupfree3d_8x4_scannet-3d-18class-L12-O256_20210702_150907-1c5551ad.pth
+  - Name: groupfree3d_w2x-head-L12-O256_4xb8_scannet-seg.py
+    In Collection: Group-Free-3D
+    Config: configs/groupfree3d/groupfree3d_w2x-head-L12-O256_4xb8_scannet-seg.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 13.3
+    Results:
+      - Task: 3D Object Detection
+        Dataset: ScanNet
+        Metrics:
+          AP@0.25: 68.20
+          AP@0.5: 51.02
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O256_20210702_200301-944f0ac0.pth
+  - Name: groupfree3d_w2x-head-L12-O512_4xb8_scannet-seg.py
+    In Collection: Group-Free-3D
+    Config: configs/groupfree3d/groupfree3d_w2x-head-L12-O512_4xb8_scannet-seg.py
+    Metadata:
+      Training Data: ScanNet
+      Training Memory (GB): 18.8
+    Results:
+      - Task: 3D Object Detection
+        Dataset: ScanNet
+        Metrics:
+          AP@0.25: 68.22
+          AP@0.5: 52.61
+    Weights: https://download.openmmlab.com/mmdetection3d/v0.1.0_models/groupfree3d/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512/groupfree3d_8x4_scannet-3d-18class-w2x-L12-O512_20210702_220204-187b71c7.pth
--- a/mmde/configs/h3dnet/README.md
+++ b/mmde/configs/h3dnet/README.md
+# H3DNet: 3D Object Detection Using Hybrid Geometric Primitives
+> [H3DNet: 3D Object Detection Using Hybrid Geometric Primitives](https://arxiv.org/abs/2006.05682)
+<!-- [ALGORITHM] -->
+## Abstract
+We introduce H3DNet, which takes a colorless 3D point cloud as input and outputs a collection of oriented object bounding boxes (or BB) and their semantic labels. The critical idea of H3DNet is to predict a hybrid set of geometric primitives, i.e., BB centers, BB face centers, and BB edge centers. We show how to convert the predicted geometric primitives into object proposals by defining a distance function between an object and the geometric primitives. This distance function enables continuous optimization of object proposals, and its local minimums provide high-fidelity object proposals. H3DNet then utilizes a matching and refinement module to classify object proposals into detected objects and fine-tune the geometric parameters of the detected objects. The hybrid set of geometric primitives not only provides more accurate signals for object detection than using a single type of geometric primitives, but it also provides an overcomplete set of constraints on the resulting 3D layout. Therefore, H3DNet can tolerate outliers in predicted geometric primitives. Our model achieves state-of-the-art 3D detection results on two large datasets with real 3D scans, ScanNet and SUN RGB-D.
+<div align=center>
+<img src="https://user-images.githubusercontent.com/36950400/143868884-26f7fc63-93fd-48cb-a469-e2f55fda5550.png" width="800"/>
+</div>
+## Introduction
+We implement H3DNet and provide the result and checkpoints on ScanNet datasets.
+## Results and models
+### ScanNet
+|                   Backbone                    | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 | AP@0.5 |                                                                                                                                                             Download                                                                                                                                                             |
+| :-------------------------------------------: | :-----: | :------: | :------------: | :-----: | :----: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [MultiBackbone](./h3dnet_8xb3_scannet-seg.py) |   3x    |   7.9    |                |  66.07  | 47.68  | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/h3dnet/h3dnet_3x8_scannet-3d-18class/h3dnet_3x8_scannet-3d-18class_20210824_003149-414bd304.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/h3dnet/h3dnet_scannet-3d-18class/h3dnet_3x8_scannet-3d-18class_20210824_003149.log.json) |
+**Notice**: If your current mmdetection3d version >= 0.6.0, and you are using the checkpoints downloaded from the above links or using checkpoints trained with mmdetection3d version \< 0.6.0, the checkpoints have to be first converted via [tools/model_converters/convert_h3dnet_checkpoints.py](../../tools/model_converters/convert_h3dnet_checkpoints.py):
+```
+python ./tools/model_converters/convert_h3dnet_checkpoints.py ${ORIGINAL_CHECKPOINT_PATH} --out=${NEW_CHECKPOINT_PATH}
+```
+Then you can use the converted checkpoints following [get_started.md](../../docs/en/get_started.md).
+## Citation
+```latex
+@inproceedings{zhang2020h3dnet,
+    author = {Zhang, Zaiwei and Sun, Bo and Yang, Haitao and Huang, Qixing},
+    title = {H3DNet: 3D Object Detection Using Hybrid Geometric Primitives},
+    booktitle = {Proceedings of the European Conference on Computer Vision},
+    year = {2020}
+}
+```
--- a/mmde/configs/h3dnet/h3dnet_8xb3_scannet-seg.py
+++ b/mmde/configs/h3dnet/h3dnet_8xb3_scannet-seg.py
+_base_ = [
+    '../_base_/datasets/scannet-3d.py', '../_base_/models/h3dnet.py',
+    '../_base_/schedules/schedule-3x.py', '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    rpn_head=dict(
+        num_classes=18,
+        bbox_coder=dict(
+            type='PartialBinBasedBBoxCoder',
+            num_sizes=18,
+            num_dir_bins=24,
+            with_rot=False,
+            mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                        [1.876858, 1.8425595, 1.1931566],
+                        [0.61328, 0.6148609, 0.7182701],
+                        [1.3955007, 1.5121545, 0.83443564],
+                        [0.97949594, 1.0675149, 0.6329687],
+                        [0.531663, 0.5955577, 1.7500148],
+                        [0.9624706, 0.72462326, 1.1481868],
+                        [0.83221924, 1.0490936, 1.6875663],
+                        [0.21132214, 0.4206159, 0.5372846],
+                        [1.4440073, 1.8970833, 0.26985747],
+                        [1.0294262, 1.4040797, 0.87554324],
+                        [1.3766412, 0.65521795, 1.6813129],
+                        [0.6650819, 0.71111923, 1.298853],
+                        [0.41999173, 0.37906948, 1.7513971],
+                        [0.59359556, 0.5912492, 0.73919016],
+                        [0.50867593, 0.50656086, 0.30136237],
+                        [1.1511526, 1.0546296, 0.49706793],
+                        [0.47535285, 0.49249494, 0.5802117]])),
+    roi_head=dict(
+        bbox_head=dict(
+            num_classes=18,
+            bbox_coder=dict(
+                type='PartialBinBasedBBoxCoder',
+                num_sizes=18,
+                num_dir_bins=24,
+                with_rot=False,
+                mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
+                            [1.876858, 1.8425595, 1.1931566],
+                            [0.61328, 0.6148609, 0.7182701],
+                            [1.3955007, 1.5121545, 0.83443564],
+                            [0.97949594, 1.0675149, 0.6329687],
+                            [0.531663, 0.5955577, 1.7500148],
+                            [0.9624706, 0.72462326, 1.1481868],
+                            [0.83221924, 1.0490936, 1.6875663],
+                            [0.21132214, 0.4206159, 0.5372846],
+                            [1.4440073, 1.8970833, 0.26985747],
+                            [1.0294262, 1.4040797, 0.87554324],
+                            [1.3766412, 0.65521795, 1.6813129],
+                            [0.6650819, 0.71111923, 1.298853],
+                            [0.41999173, 0.37906948, 1.7513971],
+                            [0.59359556, 0.5912492, 0.73919016],
+                            [0.50867593, 0.50656086, 0.30136237],
+                            [1.1511526, 1.0546296, 0.49706793],
+                            [0.47535285, 0.49249494, 0.5802117]]))))
+train_dataloader = dict(
+    batch_size=3,
+    num_workers=2,
+)
+# yapf:disable
+default_hooks = dict(
+    logger=dict(type='LoggerHook', interval=30)
+)
+# yapf:enable
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (3 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=24)
--- a/mmde/configs/h3dnet/metafile.yml
+++ b/mmde/configs/h3dnet/metafile.yml
+Collections:
+  - Name: H3DNet
+    Metadata:
+      Training Data: ScanNet
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x GeForce GTX 1080 Ti
+      Architecture:
+    Paper:
+      URL: https://arxiv.org/abs/2006.05682
+      Title: 'H3DNet: 3D Object Detection Using Hybrid Geometric Primitives'
+    README: configs/h3dnet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/h3dnet.py#L10
+      Version: v0.6.0
+Models:
+  - Name: h3dnet_3x8_scannet-3d-18class
+    In Collection: H3DNet
+    Config: configs/h3dnet/h3dnet_8xb3_scannet-seg.py
+    Metadata:
+      Training Memory (GB): 7.9
+    Results:
+      - Task: 3D Object Detection
+        Dataset: ScanNet
+        Metrics:
+          AP@0.25: 66.07
+          AP@0.5: 47.68
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/h3dnet/h3dnet_3x8_scannet-3d-18class/h3dnet_3x8_scannet-3d-18class_20210824_003149-414bd304.pth
--- a/mmde/configs/imvotenet/README.md
+++ b/mmde/configs/imvotenet/README.md
+# ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes
+> [ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes](https://arxiv.org/abs/2001.10692)
+<!-- [ALGORITHM] -->
+## Abstract
+3D object detection has seen quick progress thanks to advances in deep learning on point clouds. A few recent works have even shown state-of-the-art performance with just point clouds input (e.g. VOTENET). However, point cloud data have inherent limitations. They are sparse, lack color information and often suffer from sensor noise. Images, on the other hand, have high resolution and rich texture. Thus they can complement the 3D geometry provided by point clouds. Yet how to effectively use image information to assist point cloud based detection is still an open question. In this work, we build on top of VOTENET and propose a 3D detection architecture called IMVOTENET specialized for RGB-D scenes. IMVOTENET is based on fusing 2D votes in images and 3D votes in point clouds. Compared to prior work on multi-modal detection, we explicitly extract both geometric and semantic features from the 2D images. We leverage camera parameters to lift these features to 3D. To improve the synergy of 2D-3D feature fusion, we also propose a multi-tower training scheme. We validate our model on the challenging SUN RGB-D dataset, advancing state-of-the-art results by 5.7 mAP. We also provide rich ablation studies to analyze the contribution of each design choice.
+<div align=center>
+<img src="https://user-images.githubusercontent.com/36950400/143869878-a2ae7f43-55c3-4b95-af09-8f97dfd975f4.png" width="800"/>
+</div>
+## Introduction
+We implement ImVoteNet and provide the result and checkpoints on SUNRGBD.
+## Results and models
+### SUNRGBD-2D (Stage 1, image branch pre-train)
+|                             Backbone                             | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 | AP@0.5 |                                                                                                                                                                                                              Download                                                                                                                                                                                                              |
+| :--------------------------------------------------------------: | :-----: | :------: | :------------: | :-----: | :----: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [PointNet++](./imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py) |         |   2.1    |                |         | 62.70  | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210819_225618-62eba6ce.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210819_225618.json) |
+### SUNRGBD-3D (Stage 2)
+|                       Backbone                       | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 | AP@0.5 |                                                                                                                                                                                        Download                                                                                                                                                                                        |
+| :--------------------------------------------------: | :-----: | :------: | :------------: | :-----: | :----: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [PointNet++](./imvotenet_stage2_8xb16_sunrgbd-3d.py) |   3x    |   9.4    |                |  64.48  |        | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210819_192851-1bcd1b97.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210819_192851.log.json) |
+## Citation
+```latex
+@inproceedings{qi2020imvotenet,
+  title={Imvotenet: Boosting 3D object detection in point clouds with image votes},
+  author={Qi, Charles R and Chen, Xinlei and Litany, Or and Guibas, Leonidas J},
+  booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
+  pages={4404--4413},
+  year={2020}
+}
+```
--- a/mmde/configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py
+++ b/mmde/configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py
+_base_ = [
+    '../_base_/datasets/sunrgbd-3d.py', '../_base_/default_runtime.py',
+    '../_base_/models/imvotenet.py'
+]
+backend_args = None
+train_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_bbox_3d=False,
+        with_label_3d=False),
+    dict(
+        type='RandomChoiceResize',
+        scales=[(1333, 480), (1333, 504), (1333, 528), (1333, 552),
+                (1333, 576), (1333, 600)],
+        keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(
+        type='Pack3DDetInputs', keys=['img', 'gt_bboxes', 'gt_bboxes_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 600), keep_ratio=True),
+    dict(
+        type='Pack3DDetInputs',
+        keys=(['img']),
+        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
+                   'scale_factor'))
+]
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=2,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type='RepeatDataset', times=1, dataset=dict(pipeline=train_pipeline)))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=8, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+# learning rate
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, end=500),
+    dict(
+        type='MultiStepLR',
+        begin=0,
+        end=8,
+        by_epoch=True,
+        milestones=[6],
+        gamma=0.1)
+]
+val_evaluator = dict(type='Indoor2DMetric')
+test_evaluator = val_evaluator
+# optimizer
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001))
+load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth'  # noqa
--- a/mmde/configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py
+++ b/mmde/configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py
+_base_ = [
+    '../_base_/datasets/sunrgbd-3d.py', '../_base_/schedules/schedule-3x.py',
+    '../_base_/default_runtime.py', '../_base_/models/imvotenet.py'
+]
+class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
+               'night_stand', 'bookshelf', 'bathtub')
+backend_args = None
+model = dict(
+    pts_backbone=dict(
+        type='PointNet2SASSG',
+        in_channels=4,
+        num_points=(2048, 1024, 512, 256),
+        radius=(0.2, 0.4, 0.8, 1.2),
+        num_samples=(64, 32, 16, 16),
+        sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
+                     (128, 128, 256)),
+        fp_channels=((256, 256), (256, 256)),
+        norm_cfg=dict(type='BN2d'),
+        sa_cfg=dict(
+            type='PointSAModule',
+            pool_mod='max',
+            use_xyz=True,
+            normalize_xyz=True)),
+    pts_bbox_heads=dict(
+        common=dict(
+            type='VoteHead',
+            num_classes=10,
+            bbox_coder=dict(
+                type='PartialBinBasedBBoxCoder',
+                num_sizes=10,
+                num_dir_bins=12,
+                with_rot=True,
+                mean_sizes=[[2.114256, 1.620300, 0.927272],
+                            [0.791118, 1.279516, 0.718182],
+                            [0.923508, 1.867419, 0.845495],
+                            [0.591958, 0.552978, 0.827272],
+                            [0.699104, 0.454178, 0.75625],
+                            [0.69519, 1.346299, 0.736364],
+                            [0.528526, 1.002642, 1.172878],
+                            [0.500618, 0.632163, 0.683424],
+                            [0.404671, 1.071108, 1.688889],
+                            [0.76584, 1.398258, 0.472728]]),
+            pred_layer_cfg=dict(
+                in_channels=128, shared_conv_channels=(128, 128), bias=True),
+            objectness_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                class_weight=[0.2, 0.8],
+                reduction='sum',
+                loss_weight=5.0),
+            center_loss=dict(
+                type='ChamferDistance',
+                mode='l2',
+                reduction='sum',
+                loss_src_weight=10.0,
+                loss_dst_weight=10.0),
+            dir_class_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                reduction='sum',
+                loss_weight=1.0),
+            dir_res_loss=dict(
+                type='mmdet.SmoothL1Loss', reduction='sum', loss_weight=10.0),
+            size_class_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                reduction='sum',
+                loss_weight=1.0),
+            size_res_loss=dict(
+                type='mmdet.SmoothL1Loss',
+                reduction='sum',
+                loss_weight=10.0 / 3.0),
+            semantic_loss=dict(
+                type='mmdet.CrossEntropyLoss',
+                reduction='sum',
+                loss_weight=1.0)),
+        joint=dict(
+            vote_module_cfg=dict(
+                in_channels=512,
+                vote_per_seed=1,
+                gt_per_seed=3,
+                conv_channels=(512, 256),
+                conv_cfg=dict(type='Conv1d'),
+                norm_cfg=dict(type='BN1d'),
+                norm_feats=True,
+                vote_loss=dict(
+                    type='ChamferDistance',
+                    mode='l1',
+                    reduction='none',
+                    loss_dst_weight=10.0)),
+            vote_aggregation_cfg=dict(
+                type='PointSAModule',
+                num_point=256,
+                radius=0.3,
+                num_sample=16,
+                mlp_channels=[512, 128, 128, 128],
+                use_xyz=True,
+                normalize_xyz=True)),
+        pts=dict(
+            vote_module_cfg=dict(
+                in_channels=256,
+                vote_per_seed=1,
+                gt_per_seed=3,
+                conv_channels=(256, 256),
+                conv_cfg=dict(type='Conv1d'),
+                norm_cfg=dict(type='BN1d'),
+                norm_feats=True,
+                vote_loss=dict(
+                    type='ChamferDistance',
+                    mode='l1',
+                    reduction='none',
+                    loss_dst_weight=10.0)),
+            vote_aggregation_cfg=dict(
+                type='PointSAModule',
+                num_point=256,
+                radius=0.3,
+                num_sample=16,
+                mlp_channels=[256, 128, 128, 128],
+                use_xyz=True,
+                normalize_xyz=True)),
+        img=dict(
+            vote_module_cfg=dict(
+                in_channels=256,
+                vote_per_seed=1,
+                gt_per_seed=3,
+                conv_channels=(256, 256),
+                conv_cfg=dict(type='Conv1d'),
+                norm_cfg=dict(type='BN1d'),
+                norm_feats=True,
+                vote_loss=dict(
+                    type='ChamferDistance',
+                    mode='l1',
+                    reduction='none',
+                    loss_dst_weight=10.0)),
+            vote_aggregation_cfg=dict(
+                type='PointSAModule',
+                num_point=256,
+                radius=0.3,
+                num_sample=16,
+                mlp_channels=[256, 128, 128, 128],
+                use_xyz=True,
+                normalize_xyz=True)),
+        loss_weights=[0.4, 0.3, 0.3]),
+    img_mlp=dict(
+        in_channel=18,
+        conv_channels=(256, 256),
+        conv_cfg=dict(type='Conv1d'),
+        norm_cfg=dict(type='BN1d'),
+        act_cfg=dict(type='ReLU')),
+    fusion_layer=dict(
+        type='VoteFusion',
+        num_classes=len(class_names),
+        max_imvote_per_pixel=3),
+    num_sampled_seed=1024,
+    freeze_img_branch=True,
+    # model training and testing settings
+    train_cfg=dict(
+        pts=dict(
+            pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mode='vote')),
+    test_cfg=dict(
+        img_rcnn=dict(score_thr=0.1),
+        pts=dict(
+            sample_mode='seed',
+            nms_thr=0.25,
+            score_thr=0.05,
+            per_class_proposal=True)))
+train_pipeline = [
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='LoadAnnotations3D',
+        with_bbox=True,
+        with_label=True,
+        with_bbox_3d=True,
+        with_label_3d=True),
+    dict(type='Resize', scale=(1333, 600), keep_ratio=True),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+    ),
+    dict(
+        type='GlobalRotScaleTrans',
+        rot_range=[-0.523599, 0.523599],
+        scale_ratio_range=[0.85, 1.15],
+        shift_height=True),
+    dict(type='PointSample', num_points=20000),
+    dict(
+        type='Pack3DDetInputs',
+        keys=([
+            'img', 'gt_bboxes', 'gt_bboxes_labels', 'points', 'gt_bboxes_3d',
+            'gt_labels_3d'
+        ]))
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=backend_args),
+    dict(
+        type='LoadPointsFromFile',
+        coord_type='DEPTH',
+        shift_height=True,
+        load_dim=6,
+        use_dim=[0, 1, 2],
+        backend_args=backend_args),
+    dict(type='Resize', scale=(1333, 600), keep_ratio=True),
+    dict(type='PointSample', num_points=20000),
+    dict(type='Pack3DDetInputs', keys=['img', 'points'])
+]
+train_dataloader = dict(dataset=dict(dataset=dict(pipeline=train_pipeline)))
+val_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+test_dataloader = dict(dataset=dict(pipeline=test_pipeline))
+# may also use your own pre-trained image branch
+load_from = 'https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210819_225618-62eba6ce.pth'  # noqa
+# Default setting for scaling LR automatically
+#   - `enable` means enable scaling LR automatically
+#       or not by default.
+#   - `base_batch_size` = (8 GPUs) x (16 samples per GPU).
+auto_scale_lr = dict(enable=False, base_batch_size=128)
+randomness = dict(seed=8)
--- a/mmde/configs/imvotenet/metafile.yml
+++ b/mmde/configs/imvotenet/metafile.yml
+Collections:
+  - Name: ImVoteNet
+    Metadata:
+      Training Data: SUNRGBD
+      Training Techniques:
+        - AdamW
+      Training Resources: 8x TITAN Xp
+      Architecture:
+        - Faster R-CNN
+        - VoteNet
+        - Feature Pyramid Network
+    Paper:
+      URL: https://arxiv.org/abs/2001.10692
+      Title: 'ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes'
+    README: configs/imvotenet/README.md
+    Code:
+      URL: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/models/detectors/imvotenet.py#L56
+      Version: v0.12.0
+Models:
+  - Name: imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class
+    In Collection: ImVoteNet
+    Config: configs/imvotenet/imvotenet_faster-rcnn-r50_fpn_4xb2_sunrgbd-3d.py
+    Metadata:
+      Training Memory (GB): 2.1
+    Results:
+      - Task: Object Detection
+        Dataset: SUNRGBD-2D
+        Metrics:
+          AP@0.5: 62.70
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210819_225618-62eba6ce.pth
+  - Name: imvotenet_stage2_16x8_sunrgbd-3d-10class
+    In Collection: ImVoteNet
+    Config: configs/imvotenet/imvotenet_stage2_8xb16_sunrgbd-3d.py
+    Metadata:
+      Training Memory (GB): 9.4
+    Results:
+      - Task: 3D Object Detection
+        Dataset: SUNRGBD-3D
+        Metrics:
+          AP@0.25: 64.48
+    Weights: https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210819_192851-1bcd1b97.pth
--- a/mmde/configs/imvoxelnet/README.md
+++ b/mmde/configs/imvoxelnet/README.md
+# ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection
+> [ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection](https://arxiv.org/abs/2106.01178)
+<!-- [ALGORITHM] -->
+## Abstract
+In this paper, we introduce the task of multi-view RGB-based 3D object detection as an end-to-end optimization problem. To address this problem, we propose ImVoxelNet, a novel fully convolutional method of 3D object detection based on posed monocular or multi-view RGB images. The number of monocular images in each multiview input can variate during training and inference; actually, this number might be unique for each multi-view input. ImVoxelNet successfully handles both indoor and outdoor scenes, which makes it general-purpose. Specifically, it achieves state-of-the-art results in car detection on KITTI (monocular) and nuScenes (multi-view) benchmarks among all methods that accept RGB images. Moreover, it surpasses existing RGB-based 3D object detection methods on the SUN RGB-D dataset. On ScanNet, ImVoxelNet sets a new benchmark for multi-view 3D object detection.
+<div align=center>
+<img src="https://user-images.githubusercontent.com/36950400/143871445-38a55168-b8cd-4520-8ed6-f5c8c8ea304a.png" width="800"/>
+</div>
+## Introduction
+We implement a monocular 3D detector ImVoxelNet and provide its results and checkpoints on KITTI dataset.
+Results for SUN RGB-D, ScanNet and nuScenes are currently available in ImVoxelNet authors
+[repo](https://github.com/saic-vul/imvoxelnet) (based on mmdetection3d).
+## Results and models
+### KITTI
+|                    Backbone                    | Class | Lr schd | Mem (GB) | Inf time (fps) |  mAP  |                                                                                                                                                               Download                                                                                                                                                               |
+| :--------------------------------------------: | :---: | :-----: | :------: | :------------: | :---: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [ResNet-50](./imvoxelnet_8xb4_kitti-3d-car.py) |  Car  |   3x    |          |                | 17.26 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x8_kitti-3d-car/imvoxelnet_4x8_kitti-3d-car_20210830_003014-3d0ffdf4.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x8_kitti-3d-car/imvoxelnet_4x8_kitti-3d-car_20210830_003014.log.json) |
+### SUN RGB-D
+|                      Backbone                       | Lr schd | Mem (GB) | Inf time (fps) | mAP@0.25 | mAP@0.5 |                                                                                                                                                                           Download                                                                                                                                                                           |
+| :-------------------------------------------------: | :-----: | :------: | :------------: | :------: | :-----: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| [ResNet-50](./imvoxelnet_4x2_sunrgbd-3d-10class.py) |   2x    |   7.2    |      22.5      |  40.96   |  13.50  | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x2_sunrgbd-3d-10class/imvoxelnet_4x2_sunrgbd-3d-10class_20220809_184416-29ca7d2e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x2_sunrgbd-3d-10class/imvoxelnet_4x2_sunrgbd-3d-10class_20220809_184416.log.json) |
+## Citation
+```latex
+@article{rukhovich2021imvoxelnet,
+  title={ImVoxelNet: Image to Voxels Projection for Monocular and Multi-View General-Purpose 3D Object Detection},
+  author={Danila Rukhovich, Anna Vorontsova, Anton Konushin},
+  journal={arXiv preprint arXiv:2106.01178},
+  year={2021}
+}
+```