# PAConv: Position Adaptive Convolution with Dynamic Kernel Assembling on Point Clouds
> [PAConv: Position Adaptive Convolution with Dynamic Kernel Assembling on Point Clouds](https://arxiv.org/abs/2103.14635)
<!-- [ALGORITHM] -->
## Abstract
We introduce Position Adaptive Convolution (PAConv), a generic convolution operation for 3D point cloud processing. The key to PAConv is to construct the convolution kernel by dynamically assembling basic weight matrices stored in a Weight Bank, where the coefficients of these weight matrices are self-adaptively learned from point positions through ScoreNet. In this way, the kernel is built in a data-driven manner, endowing PAConv with more flexibility than 2D convolutions to better handle the irregular and unordered point cloud data. Besides, the complexity of the learning process is reduced by combining weight matrices instead of predicting kernels directly from point positions.
Furthermore, different from the existing point convolution operators whose network architectures are often heavily engineered, we integrate our PAConv into classical MLP-based point cloud pipelines without changing network configurations. Even built on simple networks, our method still approaches or even surpasses the state-of-the-art models, and significantly improves baseline performance on both classification and segmentation tasks, yet with decent efficiency. Thorough ablation studies and visualizations are provided to understand PAConv.
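To make the kernel assembly concrete, here is a minimal PyTorch sketch of the idea. It is illustrative only, not the operator shipped in this repo: the layer sizes, the softmax normalization of the scores, and the max-pooling aggregation are our assumptions.

```python
import torch
import torch.nn as nn


class PAConvSketch(nn.Module):
    """Illustrative sketch of PAConv-style dynamic kernel assembly."""

    def __init__(self, in_dim, out_dim, num_matrices=8):
        super().__init__()
        # Weight Bank: M basic weight matrices of shape (in_dim, out_dim).
        self.weight_bank = nn.Parameter(
            torch.randn(num_matrices, in_dim, out_dim) * 0.02)
        # ScoreNet: predicts M assembly coefficients from a relative position.
        self.score_net = nn.Sequential(
            nn.Linear(3, 16), nn.ReLU(inplace=True),
            nn.Linear(16, num_matrices), nn.Softmax(dim=-1))

    def forward(self, rel_pos, neighbor_feats):
        # rel_pos:        (B, N, K, 3), neighbor offsets from their center point
        # neighbor_feats: (B, N, K, C_in)
        scores = self.score_net(rel_pos)  # (B, N, K, M)
        # Assemble one position-adaptive kernel per neighbor:
        # kernel = sum_m score_m * W_m, shape (B, N, K, C_in, C_out).
        kernels = torch.einsum('bnkm,mio->bnkio', scores, self.weight_bank)
        out = torch.einsum('bnki,bnkio->bnko', neighbor_feats, kernels)
        # Aggregate over the K neighbors (max pooling, PointNet-style).
        return out.max(dim=2).values  # (B, N, C_out)
```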
We implement PAConv and provide the results and checkpoints on the S3DIS dataset.
**Notice**: The original PAConv paper used a step learning rate schedule. We found that a cosine schedule achieves slightly better results, so we adopt it in our implementation.
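The switch only touches the schedule part of the config. A sketch in MMCV config style follows; the concrete values (learning rate, epochs, `min_lr`) are illustrative assumptions, see the shipped schedule configs for the actual settings.

```python
# Illustrative schedule fragment in MMCV config style; values are assumptions.
optimizer = dict(type='SGD', lr=0.05, momentum=0.9, weight_decay=1e-4)
# Step schedule as used in the original paper:
# lr_config = dict(policy='step', step=[120, 160], gamma=0.1)
# Cosine schedule adopted in our implementation:
lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5)
runner = dict(type='EpochBasedRunner', max_epochs=150)
```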
## Results and models
### S3DIS
| Method | Split | Lr schd | Mem (GB) | Inf time (fps) | mIoU (Val set) | Download |
| :----: | :---: | :-----: | :------: | :------------: | :------------: | :------: |
- We use XYZ+Color+Normalized_XYZ as input in all the experiments on the S3DIS dataset.
- `Area_5` split means training the model on Area_1, 2, 3, 4, 6 and testing on Area_5.
- PAConv\* stands for the CUDA implementation of PAConv operations. See section D of the [paper](https://arxiv.org/pdf/2103.14635.pdf) appendix for more details. In our experiments, the training of PAConv\* is found to be very unstable. We achieved a slightly lower mIoU than the result in the paper, but it is consistent with the result obtained by running their [official code](https://github.com/CVMI-Lab/PAConv/tree/main/scene_seg). Besides, although the GPU memory consumption of PAConv\* is significantly lower than that of PAConv, its training and inference speeds are actually slower (by ~10%).
## Indeterminism
PAConv testing adopts sliding-patch inference, which involves random point sampling. The test script uses fixed random seeds, while the random seeds of validation during training are not fixed, so the test results may differ slightly from the results reported above.
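If you need reproducible numbers in your own evaluation code, the usual remedy is to fix the same seeds before inference. A minimal sketch (the helper name is ours):

```python
import random

import numpy as np
import torch


def set_deterministic_seed(seed: int = 0) -> None:
    """Fix the RNGs involved in random point sampling so that repeated
    sliding-patch inference runs produce identical results."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
```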
## Citation
```latex
@inproceedings{xu2021paconv,
title={PAConv: Position Adaptive Convolution with Dynamic Kernel Assembling on Point Clouds},
author={Xu, Mutian and Ding, Runyu and Zhao, Hengshuang and Qi, Xiaojuan},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
year={2021}
}
```
# From Points to Parts: 3D Object Detection from Point Cloud with Part-aware and Part-aggregation Network
> [From Points to Parts: 3D Object Detection from Point Cloud with Part-aware and Part-aggregation Network](https://arxiv.org/abs/1907.03670)
<!-- [ALGORITHM] -->
## Abstract
3D object detection from LiDAR point cloud is a challenging problem in 3D scene understanding and has many practical applications. In this paper, we extend our preliminary work PointRCNN to a novel and strong point-cloud-based 3D object detection framework, the part-aware and aggregation neural network (Part-A2 net). The whole framework consists of the part-aware stage and the part-aggregation stage. Firstly, the part-aware stage for the first time fully utilizes free-of-charge part supervisions derived from 3D ground-truth boxes to simultaneously predict high-quality 3D proposals and accurate intra-object part locations. The predicted intra-object part locations within the same proposal are grouped by our newly designed RoI-aware point cloud pooling module, which results in an effective representation to encode the geometry-specific features of each 3D proposal. Then the part-aggregation stage learns to re-score the box and refine the box location by exploring the spatial relationship of the pooled intra-object part locations. Extensive experiments are conducted to demonstrate the performance improvements from each component of our proposed framework. Our Part-A2 net outperforms all existing 3D detection methods and achieves new state-of-the-art on the KITTI 3D object detection dataset by utilizing only the LiDAR point cloud data.
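As a rough illustration of the RoI-aware pooling idea, the sketch below voxelizes the points of a single proposal into a fixed grid and averages their features per voxel, so that empty voxels (kept as zeros) also encode the proposal's geometry. It is a simplification under our own assumptions: one axis-aligned proposal in its canonical frame, average pooling only, no rotation handling.

```python
import torch


def roi_aware_avg_pool(points, feats, box_size, grid_size=14):
    """Average-pool per-point features into a fixed voxel grid of one proposal.

    points:   (P, 3) coordinates in the proposal's canonical (centered) frame.
    feats:    (P, C) per-point features, e.g. predicted intra-object part locations.
    box_size: (3,) tensor holding the proposal extents.
    Returns a (grid_size, grid_size, grid_size, C) volume; empty voxels stay zero.
    """
    C = feats.shape[1]
    pooled = feats.new_zeros(grid_size ** 3, C)
    counts = feats.new_zeros(grid_size ** 3, 1)
    # Map each point to its voxel index inside the proposal grid.
    idx3 = ((points / box_size + 0.5) * grid_size).long().clamp(0, grid_size - 1)
    flat = (idx3[:, 0] * grid_size + idx3[:, 1]) * grid_size + idx3[:, 2]
    pooled.index_add_(0, flat, feats)
    counts.index_add_(0, flat, feats.new_ones(len(flat), 1))
    pooled = pooled / counts.clamp(min=1)
    return pooled.view(grid_size, grid_size, grid_size, C)
```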
# Probabilistic and Geometric Depth: Detecting Objects in Perspective
> [Probabilistic and Geometric Depth: Detecting Objects in Perspective](https://arxiv.org/abs/2107.14160)
<!-- [ALGORITHM] -->
## Abstract
3D object detection is an important capability needed in various practical applications such as driver assistance systems. Monocular 3D detection, as a representative general setting among image-based approaches, provides a more economical solution than conventional settings relying on LiDARs but still yields unsatisfactory results. This paper first presents a systematic study on this problem. We observe that the current monocular 3D detection can be simplified as an instance depth estimation problem: The inaccurate instance depth blocks all the other 3D attribute predictions from improving the overall detection performance. Moreover, recent methods directly estimate the depth based on isolated instances or pixels while ignoring the geometric relations across different objects. To this end, we construct geometric relation graphs across predicted objects and use the graph to facilitate depth estimation. As the preliminary depth estimation of each instance is usually inaccurate in this ill-posed setting, we incorporate a probabilistic representation to capture the uncertainty. It provides an important indicator to identify confident predictions and further guide the depth propagation. Despite the simplicity of the basic idea, our method, PGD, obtains significant improvements on KITTI and nuScenes benchmarks, achieving 1st place out of all monocular vision-only methods while still maintaining real-time efficiency. Code and models will be released at [this https URL](https://github.com/open-mmlab/mmdetection3d).
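To illustrate the probabilistic depth representation, here is one standard way to attach an uncertainty to a depth regression head (an aleatoric-style sketch in the spirit of the paper, not PGD's exact formulation; all names are ours):

```python
import torch


def uncertainty_depth_loss(pred_depth, pred_log_sigma, gt_depth):
    """Depth regression with a learned log-uncertainty: errors made with high
    confidence are penalized more, uncertain ones less."""
    return (torch.abs(pred_depth - gt_depth) * torch.exp(-pred_log_sigma)
            + pred_log_sigma).mean()


def depth_confidence(pred_log_sigma):
    """A per-instance depth confidence derived from the predicted uncertainty;
    such a score can flag reliable predictions and guide depth propagation."""
    return torch.exp(-pred_log_sigma).clamp(max=1.0)
```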
PGD, which can also be regarded as FCOS3D++, is a simple yet effective monocular 3D detector. It enhances the FCOS3D baseline by involving local geometric constraints and improving instance depth estimation.

We release code and models for both the KITTI and nuScenes benchmarks, a good supplement to the original FCOS3D baseline (which is only supported on nuScenes).

For a clean implementation, our preliminary release supports base models with the proposed local geometric constraints and the probabilistic depth representation. We will add the geometric graph part in the future.

A more extensive study based on FCOS3D and PGD is ongoing. Please stay tuned.
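As a flavor of the local geometric constraints involved, perspective projection alone already ties an object's apparent size to its depth. A toy example (not PGD's exact formulation):

```python
def depth_from_height(focal_px, object_height_m, bbox_height_px):
    """An object of physical height H that appears h pixels tall under focal
    length f sits at a depth of roughly d = f * H / h."""
    return focal_px * object_height_m / bbox_height_px


# A 1.5 m tall car that is 60 px tall in a KITTI image (f ~= 721 px)
# lies at roughly 721 * 1.5 / 60 ~= 18 m.
print(depth_from_height(721.0, 1.5, 60.0))
```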
## Results and models
### KITTI
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP_11 / mAP_40 | Download |
| :------: | :-----: | :------: | :------------: | :-------------: | :------: |
Note: mAP denotes the Car moderate 3D strict AP11 / AP40 results. Because of the limited data for pedestrians and cyclists, the detection performance for these two classes is usually unstable, so we only list car detection results here. In addition, AP40 is the more recommended metric for reference due to its much better stability.
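For reference, a sketch of how the two interpolated AP variants are defined (illustrative, not the official KITTI evaluation code):

```python
import numpy as np


def kitti_interpolated_ap(recalls, precisions, mode='40'):
    """Interpolated AP on KITTI: AP11 averages the interpolated precision over
    the 11 recall points {0, 0.1, ..., 1.0}; AP40 averages over the 40 points
    {1/40, 2/40, ..., 1.0}, dropping recall = 0, which makes it far more stable."""
    if mode == '11':
        thresholds = np.linspace(0.0, 1.0, 11)
    else:
        thresholds = np.linspace(1.0 / 40, 1.0, 40)
    ap = 0.0
    for t in thresholds:
        mask = recalls >= t
        ap += (precisions[mask].max() if mask.any() else 0.0) / len(thresholds)
    return float(ap)
```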
### nuScenes
| Backbone | Lr schd | Mem (GB) | mAP | NDS | Download |
| :------: | :-----: | :------: | :-: | :-: | :------: |
### Waymo

Regrettably, we are unable to provide the pre-trained model weights on Waymo due to the [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs for this dataset.
## Citation
```latex
@inproceedings{wang2021pgd,
title={{Probabilistic and Geometric Depth: Detecting} Objects in Perspective},
author={Wang, Tai and Zhu, Xinge and Pang, Jiangmiao and Lin, Dahua},
booktitle={Conference on Robot Learning (CoRL) 2021},
year={2021}
}
% For the baseline version
@inproceedings{wang2021fcos3d,
title={{FCOS3D: Fully} Convolutional One-Stage Monocular 3D Object Detection},
author={Wang, Tai and Zhu, Xinge and Pang, Jiangmiao and Lin, Dahua},
booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV) Workshops},
year={2021}
}
```