Unverified Commit fe25f7a5 authored by Wenwei Zhang, committed by GitHub

Merge pull request #2867 from open-mmlab/dev-1.x

Bump version to 1.4.0
parents 5c0613be 0ef13b83
@@ -275,12 +275,13 @@ class BaseInstance3DBoxes:
             Tensor: A binary vector indicating whether each point is inside the
             reference range.
         """
-        in_range_flags = ((self.tensor[:, 0] > box_range[0])
-                          & (self.tensor[:, 1] > box_range[1])
-                          & (self.tensor[:, 2] > box_range[2])
-                          & (self.tensor[:, 0] < box_range[3])
-                          & (self.tensor[:, 1] < box_range[4])
-                          & (self.tensor[:, 2] < box_range[5]))
+        gravity_center = self.gravity_center
+        in_range_flags = ((gravity_center[:, 0] > box_range[0])
+                          & (gravity_center[:, 1] > box_range[1])
+                          & (gravity_center[:, 2] > box_range[2])
+                          & (gravity_center[:, 0] < box_range[3])
+                          & (gravity_center[:, 1] < box_range[4])
+                          & (gravity_center[:, 2] < box_range[5]))
         return in_range_flags

     @abstractmethod
......
 # Copyright (c) Open-MMLab. All rights reserved.
-__version__ = '1.3.0'
+__version__ = '1.4.0'
 short_version = __version__
......
@@ -179,7 +179,10 @@ test_pipeline = [
             dict(
                 type='PointsRangeFilter', point_cloud_range=point_cloud_range)
         ]),
-    dict(type='Pack3DDetInputs', keys=['points'])
+    dict(
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
 ]

 dataset_type = 'WaymoDataset'
@@ -223,13 +226,7 @@ val_dataloader = dict(
 test_dataloader = val_dataloader

 val_evaluator = dict(
-    type='WaymoMetric',
-    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
-    waymo_bin_file='./data/waymo/waymo_format/gt.bin',
-    data_root='./data/waymo/waymo_format',
-    backend_args=backend_args,
-    convert_kitti_format=False,
-    idx2metainfo='./data/waymo/waymo_format/idx2metainfo.pkl')
+    type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin')
 test_evaluator = val_evaluator

 vis_backends = [dict(type='LocalVisBackend')]
......
@@ -57,17 +57,25 @@ python tools/test.py projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-
 ### Training commands

-The support of training DSVT is on the way.
+In MMDetection3D's root directory, run the following command to train the model:
+
+```bash
+tools/dist_train.sh projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py 8 --sync_bn torch
+```

 ## Results and models

 ### Waymo

-| Middle Encoder | Backbone | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download |
-| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
-| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × |  |  | 75.2 | 72.2 | 68.9 | 66.1 |  |
+| Middle Encoder | Backbone | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download |
+| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
+| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × | 75.5 | 72.4 | 69.2 | 66.3 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/dsvt/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class_20230917_102130.log) |

-**Note** that `ResSECOND` denotes the base block in SECOND has residual layers.
+**Note**:
+
+- `ResSECOND` denotes that the base block in SECOND has residual layers.
+- Regrettably, we are unable to provide the pre-trained model weights due to the [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above.

 ## Citation
......
@@ -88,25 +88,28 @@ model = dict(
         loss_cls=dict(
             type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0),
         loss_bbox=dict(type='mmdet.L1Loss', reduction='mean', loss_weight=2.0),
+        loss_iou=dict(type='mmdet.L1Loss', reduction='sum', loss_weight=1.0),
+        loss_reg_iou=dict(
+            type='mmdet3d.DIoU3DLoss', reduction='mean', loss_weight=2.0),
         norm_bbox=True),
     # model training and testing settings
     train_cfg=dict(
-        pts=dict(
-            grid_size=grid_size,
-            voxel_size=voxel_size,
-            out_size_factor=4,
-            dense_reg=1,
-            gaussian_overlap=0.1,
-            max_objs=500,
-            min_radius=2,
-            code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])),
+        grid_size=grid_size,
+        voxel_size=voxel_size,
+        point_cloud_range=point_cloud_range,
+        out_size_factor=1,
+        dense_reg=1,
+        gaussian_overlap=0.1,
+        max_objs=500,
+        min_radius=2,
+        code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]),
     test_cfg=dict(
         max_per_img=500,
         max_pool_nms=False,
         min_radius=[4, 12, 10, 1, 0.85, 0.175],
         iou_rectifier=[[0.68, 0.71, 0.65]],
         pc_range=[-80, -80],
-        out_size_factor=4,
+        out_size_factor=1,
         voxel_size=voxel_size[:2],
         nms_type='rotate',
         multi_class_nms=True,
@@ -128,6 +131,8 @@ db_sampler = dict(
         coord_type='LIDAR',
         load_dim=6,
         use_dim=[0, 1, 2, 3, 4],
+        norm_intensity=True,
+        norm_elongation=True,
         backend_args=backend_args),
     backend_args=backend_args)
@@ -138,25 +143,22 @@ train_pipeline = [
         load_dim=6,
         use_dim=5,
         norm_intensity=True,
+        norm_elongation=True,
         backend_args=backend_args),
-    # Add this if using `MultiFrameDeformableDecoderRPN`
-    # dict(
-    #     type='LoadPointsFromMultiSweeps',
-    #     sweeps_num=9,
-    #     load_dim=6,
-    #     use_dim=[0, 1, 2, 3, 4],
-    #     pad_empty_sweeps=True,
-    #     remove_close=True),
     dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
     dict(type='ObjectSample', db_sampler=db_sampler),
+    dict(
+        type='RandomFlip3D',
+        sync_2d=False,
+        flip_ratio_bev_horizontal=0.5,
+        flip_ratio_bev_vertical=0.5),
     dict(
         type='GlobalRotScaleTrans',
         rot_range=[-0.78539816, 0.78539816],
         scale_ratio_range=[0.95, 1.05],
-        translation_std=[0.5, 0.5, 0]),
-    dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
-    dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
-    dict(type='ObjectNameFilter', classes=class_names),
+        translation_std=[0.5, 0.5, 0.5]),
+    dict(type='PointsRangeFilter3D', point_cloud_range=point_cloud_range),
+    dict(type='ObjectRangeFilter3D', point_cloud_range=point_cloud_range),
     dict(type='PointShuffle'),
     dict(
         type='Pack3DDetInputs',
@@ -172,25 +174,34 @@ test_pipeline = [
         norm_intensity=True,
         norm_elongation=True,
         backend_args=backend_args),
+    dict(type='PointsRangeFilter3D', point_cloud_range=point_cloud_range),
     dict(
-        type='MultiScaleFlipAug3D',
-        img_scale=(1333, 800),
-        pts_scale_ratio=1,
-        flip=False,
-        transforms=[
-            dict(
-                type='GlobalRotScaleTrans',
-                rot_range=[0, 0],
-                scale_ratio_range=[1., 1.],
-                translation_std=[0, 0, 0]),
-            dict(type='RandomFlip3D'),
-            dict(
-                type='PointsRangeFilter', point_cloud_range=point_cloud_range)
-        ]),
-    dict(type='Pack3DDetInputs', keys=['points'])
+        type='Pack3DDetInputs',
+        keys=['points'],
+        meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
 ]

 dataset_type = 'WaymoDataset'
+train_dataloader = dict(
+    batch_size=1,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='waymo_infos_train.pkl',
+        data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
+        pipeline=train_pipeline,
+        modality=input_modality,
+        test_mode=False,
+        metainfo=metainfo,
+        # we use box_type_3d='LiDAR' in kitti and nuscenes dataset
+        # and box_type_3d='Depth' in sunrgbd and scannet dataset.
+        box_type_3d='LiDAR',
+        # load one frame every five frames
+        load_interval=5,
+        backend_args=backend_args))
 val_dataloader = dict(
     batch_size=4,
     num_workers=4,
@@ -212,18 +223,59 @@ test_dataloader = val_dataloader

 val_evaluator = dict(
     type='WaymoMetric',
-    ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
     waymo_bin_file='./data/waymo/waymo_format/gt.bin',
-    data_root='./data/waymo/waymo_format',
-    backend_args=backend_args,
-    convert_kitti_format=False,
-    idx2metainfo='./data/waymo/waymo_format/idx2metainfo.pkl')
+    result_prefix='./dsvt_pred')
 test_evaluator = val_evaluator

 vis_backends = [dict(type='LocalVisBackend')]
 visualizer = dict(
     type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')

+# schedules
+lr = 1e-5
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=lr, weight_decay=0.05, betas=(0.9, 0.99)),
+    clip_grad=dict(max_norm=10, norm_type=2))
+param_scheduler = [
+    dict(
+        type='CosineAnnealingLR',
+        T_max=1.2,
+        eta_min=lr * 100,
+        begin=0,
+        end=1.2,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingLR',
+        T_max=10.8,
+        eta_min=lr * 1e-4,
+        begin=1.2,
+        end=12,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    # momentum scheduler
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=1.2,
+        eta_min=0.85,
+        begin=0,
+        end=1.2,
+        by_epoch=True,
+        convert_to_iter_based=True),
+    dict(
+        type='CosineAnnealingMomentum',
+        T_max=10.8,
+        eta_min=0.95,
+        begin=1.2,
+        end=12,
+        by_epoch=True,
+        convert_to_iter_based=True)
+]
+
+# runtime settings
+train_cfg = dict(by_epoch=True, max_epochs=12, val_interval=1)
 # runtime settings
 val_cfg = dict()
 test_cfg = dict()
@@ -236,4 +288,12 @@ test_cfg = dict()

 default_hooks = dict(
     logger=dict(type='LoggerHook', interval=50),
-    checkpoint=dict(type='CheckpointHook', interval=5))
+    checkpoint=dict(type='CheckpointHook', interval=1))
+custom_hooks = [
+    dict(
+        type='DisableAugHook',
+        disable_after_epoch=11,
+        disable_aug_list=[
+            'GlobalRotScaleTrans', 'RandomFlip3D', 'ObjectSample'
+        ])
+]
+from .disable_aug_hook import DisableAugHook
 from .dsvt import DSVT
 from .dsvt_head import DSVTCenterHead
 from .dsvt_transformer import DSVTMiddleEncoder
 from .dynamic_pillar_vfe import DynamicPillarVFE3D
 from .map2bev import PointPillarsScatter3D
 from .res_second import ResSECOND
+from .transforms_3d import ObjectRangeFilter3D, PointsRangeFilter3D
 from .utils import DSVTBBoxCoder

 __all__ = [
     'DSVTCenterHead', 'DSVT', 'DSVTMiddleEncoder', 'DynamicPillarVFE3D',
-    'PointPillarsScatter3D', 'ResSECOND', 'DSVTBBoxCoder'
+    'PointPillarsScatter3D', 'ResSECOND', 'DSVTBBoxCoder',
+    'ObjectRangeFilter3D', 'PointsRangeFilter3D', 'DisableAugHook'
 ]
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List
from mmengine.dataset import BaseDataset
from mmengine.hooks import Hook
from mmengine.model import is_model_wrapper
from mmengine.runner import Runner
from mmdet3d.registry import HOOKS
@HOOKS.register_module()
class DisableAugHook(Hook):
"""The hook of disabling augmentations during training.
Args:
disable_after_epoch (int): The number of epochs after which
the data augmentation will be closed in the training.
Defaults to 15.
disable_aug_list (list): the list of data augmentation will
be closed in the training. Defaults to [].
"""
def __init__(self,
disable_after_epoch: int = 15,
disable_aug_list: List = []):
self.disable_after_epoch = disable_after_epoch
self.disable_aug_list = disable_aug_list
self._restart_dataloader = False
def before_train_epoch(self, runner: Runner):
"""Close augmentation.
Args:
runner (Runner): The runner.
"""
epoch = runner.epoch
train_loader = runner.train_dataloader
model = runner.model
# TODO: refactor after mmengine using model wrapper
if is_model_wrapper(model):
model = model.module
if epoch == self.disable_after_epoch:
dataset = runner.train_dataloader.dataset
# handle dataset wrapper
if not isinstance(dataset, BaseDataset):
dataset = dataset.dataset
new_transforms = []
for transform in dataset.pipeline.transforms: # noqa: E501
if transform.__class__.__name__ not in self.disable_aug_list:
new_transforms.append(transform)
else:
runner.logger.info(
f'Disable {transform.__class__.__name__}')
dataset.pipeline.transforms = new_transforms
# The dataset pipeline cannot be updated when persistent_workers
# is True, so we need to force the dataloader's multi-process
# restart. This is a very hacky approach.
if hasattr(train_loader, 'persistent_workers'
) and train_loader.persistent_workers is True:
train_loader._DataLoader__initialized = False
train_loader._iterator = None
self._restart_dataloader = True
else:
# Once the restart is complete, we need to restore
# the initialization flag.
if self._restart_dataloader:
train_loader._DataLoader__initialized = True
@@ -103,7 +103,11 @@ class DSVT(Base3DDetector):
         Returns:
             dict[str, Tensor]: A dictionary of loss components.
         """
-        pass
+        pts_feats = self.extract_feat(batch_inputs_dict)
+        losses = dict()
+        loss = self.bbox_head.loss(pts_feats, batch_data_samples)
+        losses.update(loss)
+        return losses

     def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
                 batch_data_samples: List[Det3DDataSample],
......
+import math
 from typing import Dict, List, Tuple

 import torch
+import torch.nn as nn
+from mmcv.ops import boxes_iou3d
 from mmdet.models.utils import multi_apply
+from mmengine.model import kaiming_init
 from mmengine.structures import InstanceData
 from torch import Tensor
+from torch.nn.init import constant_

 from mmdet3d.models import CenterHead
 from mmdet3d.models.layers import circle_nms, nms_bev
+from mmdet3d.models.utils import (clip_sigmoid, draw_heatmap_gaussian,
+                                  gaussian_radius)
 from mmdet3d.registry import MODELS
 from mmdet3d.structures import Det3DDataSample, xywhr2xyxyr
@@ -18,8 +25,33 @@ class DSVTCenterHead(CenterHead):
     This head adds IoU prediction branch based on the original CenterHead.
     """

-    def __init__(self, *args, **kwargs):
+    def __init__(self,
+                 loss_iou=dict(
+                     type='mmdet.L1Loss', reduction='mean', loss_weight=1),
+                 loss_reg_iou=None,
+                 *args,
+                 **kwargs):
         super(DSVTCenterHead, self).__init__(*args, **kwargs)
+        self.loss_iou = MODELS.build(loss_iou)
+        self.loss_iou_reg = MODELS.build(
+            loss_reg_iou) if loss_reg_iou is not None else None
def init_weights(self):
kaiming_init(
self.shared_conv.conv,
a=math.sqrt(5),
mode='fan_in',
nonlinearity='leaky_relu',
distribution='uniform')
for head in self.task_heads[0].heads:
if head == 'heatmap':
constant_(self.task_heads[0].__getattr__(head)[-1].bias,
self.task_heads[0].init_bias)
else:
for m in self.task_heads[0].__getattr__(head).modules():
if isinstance(m, nn.Conv2d):
kaiming_init(
m, mode='fan_in', nonlinearity='leaky_relu')
     def forward_single(self, x: Tensor) -> dict:
         """Forward function for CenterPoint.
@@ -66,7 +98,298 @@ class DSVTCenterHead(CenterHead):
         Returns:
             dict: Losses of each branch.
         """
-        pass
+        outs = self(pts_feats)
+        batch_gt_instance_3d = []
+        for data_sample in batch_data_samples:
+            batch_gt_instance_3d.append(data_sample.gt_instances_3d)
+        losses = self.loss_by_feat(outs, batch_gt_instance_3d)
+        return losses
def _decode_all_preds(self,
pred_dict,
point_cloud_range=None,
voxel_size=None):
batch_size, _, H, W = pred_dict['reg'].shape
batch_center = pred_dict['reg'].permute(0, 2, 3, 1).contiguous().view(
batch_size, H * W, 2) # (B, H, W, 2)
batch_center_z = pred_dict['height'].permute(
0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1)
batch_dim = pred_dict['dim'].exp().permute(
0, 2, 3, 1).contiguous().view(batch_size, H * W, 3) # (B, H, W, 3)
batch_rot_cos = pred_dict['rot'][:, 0].unsqueeze(dim=1).permute(
0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1)
batch_rot_sin = pred_dict['rot'][:, 1].unsqueeze(dim=1).permute(
0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1)
batch_vel = pred_dict['vel'].permute(0, 2, 3, 1).contiguous().view(
batch_size, H * W, 2) if 'vel' in pred_dict.keys() else None
angle = torch.atan2(batch_rot_sin, batch_rot_cos) # (B, H*W, 1)
ys, xs = torch.meshgrid([
torch.arange(
0, H, device=batch_center.device, dtype=batch_center.dtype),
torch.arange(
0, W, device=batch_center.device, dtype=batch_center.dtype)
])
ys = ys.view(1, H, W).repeat(batch_size, 1, 1)
xs = xs.view(1, H, W).repeat(batch_size, 1, 1)
xs = xs.view(batch_size, -1, 1) + batch_center[:, :, 0:1]
ys = ys.view(batch_size, -1, 1) + batch_center[:, :, 1:2]
xs = xs * voxel_size[0] + point_cloud_range[0]
ys = ys * voxel_size[1] + point_cloud_range[1]
box_part_list = [xs, ys, batch_center_z, batch_dim, angle]
if batch_vel is not None:
box_part_list.append(batch_vel)
box_preds = torch.cat((box_part_list),
dim=-1).view(batch_size, H, W, -1)
return box_preds
def _transpose_and_gather_feat(self, feat, ind):
feat = feat.permute(0, 2, 3, 1).contiguous()
feat = feat.view(feat.size(0), -1, feat.size(3))
feat = self._gather_feat(feat, ind)
return feat
def calc_iou_loss(self, iou_preds, batch_box_preds, mask, ind, gt_boxes):
"""
Args:
iou_preds: (batch x 1 x h x w)
batch_box_preds: (batch x (7 or 9) x h x w)
mask: (batch x max_objects)
ind: (batch x max_objects)
gt_boxes: List of batch groundtruth boxes.
Returns:
Tensor: IoU Loss.
"""
if mask.sum() == 0:
return iou_preds.new_zeros((1))
mask = mask.bool()
selected_iou_preds = self._transpose_and_gather_feat(iou_preds,
ind)[mask]
selected_box_preds = self._transpose_and_gather_feat(
batch_box_preds, ind)[mask]
gt_boxes = torch.cat(gt_boxes)
assert gt_boxes.size(0) == selected_box_preds.size(0)
iou_target = boxes_iou3d(selected_box_preds[:, 0:7], gt_boxes[:, 0:7])
iou_target = torch.diag(iou_target).view(-1)
iou_target = iou_target * 2 - 1 # [0, 1] ==> [-1, 1]
loss = self.loss_iou(selected_iou_preds.view(-1), iou_target)
loss = loss / torch.clamp(mask.sum(), min=1e-4)
return loss
def calc_iou_reg_loss(self, batch_box_preds, mask, ind, gt_boxes):
if mask.sum() == 0:
return batch_box_preds.new_zeros((1))
mask = mask.bool()
selected_box_preds = self._transpose_and_gather_feat(
batch_box_preds, ind)[mask]
gt_boxes = torch.cat(gt_boxes)
assert gt_boxes.size(0) == selected_box_preds.size(0)
loss = self.loss_iou_reg(selected_box_preds[:, 0:7], gt_boxes[:, 0:7])
return loss
def get_targets(
self,
batch_gt_instances_3d: List[InstanceData],
) -> Tuple[List[Tensor]]:
"""Generate targets.
How each output is transformed:
Each nested list is transposed so that all same-index elements in
each sub-list (1, ..., N) become the new sub-lists.
[ [a0, a1, a2, ... ], [b0, b1, b2, ... ], ... ]
==> [ [a0, b0, ... ], [a1, b1, ... ], [a2, b2, ... ] ]
The new transposed nested list is converted into a list of N
tensors generated by concatenating tensors in the new sub-lists.
[ tensor0, tensor1, tensor2, ... ]
Args:
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instances. It usually includes ``bboxes_3d`` and\
``labels_3d`` attributes.
Returns:
Returns:
tuple[list[torch.Tensor]]: Tuple of target including
the following results in order.
- list[torch.Tensor]: Heatmap scores.
- list[torch.Tensor]: Ground truth boxes.
- list[torch.Tensor]: Indexes indicating the
position of the valid boxes.
- list[torch.Tensor]: Masks indicating which
boxes are valid.
"""
heatmaps, anno_boxes, inds, masks, task_gt_bboxes = multi_apply(
self.get_targets_single, batch_gt_instances_3d)
# Transpose heatmaps
heatmaps = list(map(list, zip(*heatmaps)))
heatmaps = [torch.stack(hms_) for hms_ in heatmaps]
# Transpose anno_boxes
anno_boxes = list(map(list, zip(*anno_boxes)))
anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in anno_boxes]
# Transpose inds
inds = list(map(list, zip(*inds)))
inds = [torch.stack(inds_) for inds_ in inds]
# Transpose masks
masks = list(map(list, zip(*masks)))
masks = [torch.stack(masks_) for masks_ in masks]
# Transpose task_gt_bboxes
task_gt_bboxes = list(map(list, zip(*task_gt_bboxes)))
return heatmaps, anno_boxes, inds, masks, task_gt_bboxes
def get_targets_single(self,
gt_instances_3d: InstanceData) -> Tuple[Tensor]:
"""Generate training targets for a single sample.
Args:
gt_instances_3d (:obj:`InstanceData`): Gt_instances_3d of
single data sample. It usually includes
``bboxes_3d`` and ``labels_3d`` attributes.
Returns:
tuple[list[torch.Tensor]]: Tuple of target including
the following results in order.
- list[torch.Tensor]: Heatmap scores.
- list[torch.Tensor]: Ground truth boxes.
- list[torch.Tensor]: Indexes indicating the position
of the valid boxes.
- list[torch.Tensor]: Masks indicating which boxes
are valid.
"""
gt_labels_3d = gt_instances_3d.labels_3d
gt_bboxes_3d = gt_instances_3d.bboxes_3d
device = gt_labels_3d.device
gt_bboxes_3d = torch.cat(
(gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]),
dim=1).to(device)
max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg']
grid_size = torch.tensor(self.train_cfg['grid_size']).to(device)
pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
voxel_size = torch.tensor(self.train_cfg['voxel_size'])
feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor']
# reorganize the gt_dict by tasks
task_masks = []
flag = 0
for class_name in self.class_names:
task_masks.append([
torch.where(gt_labels_3d == class_name.index(i) + flag)
for i in class_name
])
flag += len(class_name)
task_boxes = []
task_classes = []
flag2 = 0
for idx, mask in enumerate(task_masks):
task_box = []
task_class = []
for m in mask:
task_box.append(gt_bboxes_3d[m])
# 0 is background for each task, so we need to add 1 here.
task_class.append(gt_labels_3d[m] + 1 - flag2)
task_boxes.append(torch.cat(task_box, axis=0).to(device))
task_classes.append(torch.cat(task_class).long().to(device))
flag2 += len(mask)
draw_gaussian = draw_heatmap_gaussian
heatmaps, anno_boxes, inds, masks = [], [], [], []
for idx, task_head in enumerate(self.task_heads):
heatmap = gt_bboxes_3d.new_zeros(
(len(self.class_names[idx]), feature_map_size[1],
feature_map_size[0]))
anno_box = gt_bboxes_3d.new_zeros((max_objs, 8),
dtype=torch.float32)
ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64)
mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8)
num_objs = min(task_boxes[idx].shape[0], max_objs)
for k in range(num_objs):
cls_id = task_classes[idx][k] - 1
length = task_boxes[idx][k][3]
width = task_boxes[idx][k][4]
length = length / voxel_size[0] / self.train_cfg[
'out_size_factor']
width = width / voxel_size[1] / self.train_cfg[
'out_size_factor']
if width > 0 and length > 0:
radius = gaussian_radius(
(width, length),
min_overlap=self.train_cfg['gaussian_overlap'])
radius = max(self.train_cfg['min_radius'], int(radius))
# be really careful for the coordinate system of
# your box annotation.
x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][
1], task_boxes[idx][k][2]
coor_x = (
x - pc_range[0]
) / voxel_size[0] / self.train_cfg['out_size_factor']
coor_y = (
y - pc_range[1]
) / voxel_size[1] / self.train_cfg['out_size_factor']
center = torch.tensor([coor_x, coor_y],
dtype=torch.float32,
device=device)
center_int = center.to(torch.int32)
# throw out not in range objects to avoid out of array
# area when creating the heatmap
if not (0 <= center_int[0] < feature_map_size[0]
and 0 <= center_int[1] < feature_map_size[1]):
continue
draw_gaussian(heatmap[cls_id], center_int, radius)
new_idx = k
x, y = center_int[0], center_int[1]
assert (y * feature_map_size[0] + x <
feature_map_size[0] * feature_map_size[1])
ind[new_idx] = y * feature_map_size[0] + x
mask[new_idx] = 1
# TODO: support other outdoor dataset
rot = task_boxes[idx][k][6]
box_dim = task_boxes[idx][k][3:6]
if self.norm_bbox:
box_dim = box_dim.log()
anno_box[new_idx] = torch.cat([
center - torch.tensor([x, y], device=device),
z.unsqueeze(0), box_dim,
torch.cos(rot).unsqueeze(0),
torch.sin(rot).unsqueeze(0)
])
heatmaps.append(heatmap)
anno_boxes.append(anno_box)
masks.append(mask)
inds.append(ind)
return heatmaps, anno_boxes, inds, masks, task_boxes
     def loss_by_feat(self, preds_dicts: Tuple[List[dict]],
                      batch_gt_instances_3d: List[InstanceData], *args,
@@ -79,13 +402,72 @@ class DSVTCenterHead(CenterHead):
                 tasks head, and the internal list indicate different
                 FPN level.
             batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
-                gt_instances. It usually includes ``bboxes_3d`` and\
+                gt_instances_3d. It usually includes ``bboxes_3d`` and
                 ``labels_3d`` attributes.

         Returns:
             dict[str,torch.Tensor]: Loss of heatmap and bbox of each task.
         """
-        pass
+        heatmaps, anno_boxes, inds, masks, task_gt_bboxes = self.get_targets(
+            batch_gt_instances_3d)
loss_dict = dict()
for task_id, preds_dict in enumerate(preds_dicts):
# heatmap focal loss
preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap'])
num_pos = heatmaps[task_id].eq(1).float().sum().item()
loss_heatmap = self.loss_cls(
preds_dict[0]['heatmap'],
heatmaps[task_id],
avg_factor=max(num_pos, 1))
target_box = anno_boxes[task_id]
# reconstruct the anno_box from multiple reg heads
preds_dict[0]['anno_box'] = torch.cat(
(preds_dict[0]['reg'], preds_dict[0]['height'],
preds_dict[0]['dim'], preds_dict[0]['rot']),
dim=1)
# Regression loss for dimension, offset, height, rotation
ind = inds[task_id]
num = masks[task_id].float().sum()
pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous()
pred = pred.view(pred.size(0), -1, pred.size(3))
pred = self._gather_feat(pred, ind)
mask = masks[task_id].unsqueeze(2).expand_as(target_box).float()
isnotnan = (~torch.isnan(target_box)).float()
mask *= isnotnan
code_weights = self.train_cfg.get('code_weights', None)
bbox_weights = mask * mask.new_tensor(code_weights)
loss_bbox = self.loss_bbox(
pred, target_box, bbox_weights, avg_factor=(num + 1e-4))
loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap
loss_dict[f'task{task_id}.loss_bbox'] = loss_bbox
if 'iou' in preds_dict[0]:
batch_box_preds = self._decode_all_preds(
pred_dict=preds_dict[0],
point_cloud_range=self.train_cfg['point_cloud_range'],
voxel_size=self.train_cfg['voxel_size']
) # (B, H, W, 7 or 9)
batch_box_preds_for_iou = batch_box_preds.permute(
0, 3, 1, 2) # (B, 7 or 9, H, W)
loss_dict[f'task{task_id}.loss_iou'] = self.calc_iou_loss(
iou_preds=preds_dict[0]['iou'],
batch_box_preds=batch_box_preds_for_iou.clone().detach(),
mask=masks[task_id],
ind=ind,
gt_boxes=task_gt_bboxes[task_id])
if self.loss_iou_reg is not None:
loss_dict[f'task{task_id}.loss_reg_iou'] = \
self.calc_iou_reg_loss(
batch_box_preds=batch_box_preds_for_iou,
mask=masks[task_id],
ind=ind,
gt_boxes=task_gt_bboxes[task_id])
return loss_dict
     def predict(self,
                 pts_feats: Tuple[torch.Tensor],
@@ -158,6 +540,7 @@ class DSVTCenterHead(CenterHead):
             else:
                 batch_dim = preds_dict[0]['dim']

+            # It's different from CenterHead
             batch_rotc = preds_dict[0]['rot'][:, 0].unsqueeze(1)
             batch_rots = preds_dict[0]['rot'][:, 1].unsqueeze(1)
             batch_iou = (preds_dict[0]['iou'] +
......
 # modified from https://github.com/Haiyang-W/DSVT
+import numpy as np
 import torch
 import torch.nn as nn
 import torch_scatter
@@ -76,6 +77,7 @@ class DynamicPillarVFE3D(nn.Module):
         self.voxel_x = voxel_size[0]
         self.voxel_y = voxel_size[1]
         self.voxel_z = voxel_size[2]
+        point_cloud_range = np.array(point_cloud_range).astype(np.float32)
         self.x_offset = self.voxel_x / 2 + point_cloud_range[0]
         self.y_offset = self.voxel_y / 2 + point_cloud_range[1]
         self.z_offset = self.voxel_z / 2 + point_cloud_range[2]
......
 # modified from https://github.com/Haiyang-W/DSVT
-import warnings
-from typing import Optional, Sequence, Tuple
+from typing import Sequence, Tuple

 from mmengine.model import BaseModule
 from torch import Tensor
@@ -78,8 +76,8 @@ class ResSECOND(BaseModule):
         out_channels (list[int]): Output channels for multi-scale feature maps.
         blocks_nums (list[int]): Number of blocks in each stage.
         layer_strides (list[int]): Strides of each stage.
-        norm_cfg (dict): Config dict of normalization layers.
-        conv_cfg (dict): Config dict of convolutional layers.
+        init_cfg (dict, optional): Config for weight initialization.
+            Defaults to None.
     """

     def __init__(self,
@@ -87,8 +85,7 @@ class ResSECOND(BaseModule):
                  out_channels: Sequence[int] = [128, 128, 256],
                  blocks_nums: Sequence[int] = [1, 2, 2],
                  layer_strides: Sequence[int] = [2, 2, 2],
-                 init_cfg: OptMultiConfig = None,
-                 pretrained: Optional[str] = None) -> None:
+                 init_cfg: OptMultiConfig = None) -> None:
         super(ResSECOND, self).__init__(init_cfg=init_cfg)
         assert len(layer_strides) == len(blocks_nums)
         assert len(out_channels) == len(blocks_nums)
@@ -108,14 +105,6 @@ class ResSECOND(BaseModule):
                     BasicResBlock(out_channels[i], out_channels[i]))
             blocks.append(nn.Sequential(*cur_layers))
         self.blocks = nn.Sequential(*blocks)
-        assert not (init_cfg and pretrained), \
-            'init_cfg and pretrained cannot be setting at the same time'
-        if isinstance(pretrained, str):
-            warnings.warn('DeprecationWarning: pretrained is a deprecated, '
-                          'please use "init_cfg" instead')
-            self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
-        else:
-            self.init_cfg = dict(type='Kaiming', layer='Conv2d')

     def forward(self, x: Tensor) -> Tuple[Tensor, ...]:
         """Forward function.
......
from typing import List
import numpy as np
from mmcv import BaseTransform
from mmdet3d.registry import TRANSFORMS
@TRANSFORMS.register_module()
class ObjectRangeFilter3D(BaseTransform):
"""Filter objects by the range. It differs from `ObjectRangeFilter` by
using `in_range_3d` instead of `in_range_bev`.
Required Keys:
- gt_bboxes_3d
Modified Keys:
- gt_bboxes_3d
Args:
point_cloud_range (list[float]): Point cloud range.
"""
def __init__(self, point_cloud_range: List[float]) -> None:
self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
def transform(self, input_dict: dict) -> dict:
"""Transform function to filter objects by the range.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d'
keys are updated in the result dict.
"""
gt_bboxes_3d = input_dict['gt_bboxes_3d']
gt_labels_3d = input_dict['gt_labels_3d']
mask = gt_bboxes_3d.in_range_3d(self.pcd_range)
gt_bboxes_3d = gt_bboxes_3d[mask]
# mask is a torch tensor but gt_labels_3d is still numpy array
# using mask to index gt_labels_3d will cause bug when
# len(gt_labels_3d) == 1, where mask=1 will be interpreted
# as gt_labels_3d[1] and cause out of index error
gt_labels_3d = gt_labels_3d[mask.numpy().astype(bool)]
# limit rad to [-pi, pi]
gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi)
input_dict['gt_bboxes_3d'] = gt_bboxes_3d
input_dict['gt_labels_3d'] = gt_labels_3d
return input_dict
def __repr__(self) -> str:
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
return repr_str
@TRANSFORMS.register_module()
class PointsRangeFilter3D(BaseTransform):
    """Filter points by the range. It differs from `PointsRangeFilter` by using
`in_range_bev` instead of `in_range_3d`.
Required Keys:
- points
- pts_instance_mask (optional)
Modified Keys:
- points
- pts_instance_mask (optional)
Args:
point_cloud_range (list[float]): Point cloud range.
"""
def __init__(self, point_cloud_range: List[float]) -> None:
self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
def transform(self, input_dict: dict) -> dict:
"""Transform function to filter points by the range.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after filtering, 'points', 'pts_instance_mask'
and 'pts_semantic_mask' keys are updated in the result dict.
"""
points = input_dict['points']
points_mask = points.in_range_bev(self.pcd_range[[0, 1, 3, 4]])
clean_points = points[points_mask]
input_dict['points'] = clean_points
points_mask = points_mask.numpy()
pts_instance_mask = input_dict.get('pts_instance_mask', None)
pts_semantic_mask = input_dict.get('pts_semantic_mask', None)
if pts_instance_mask is not None:
input_dict['pts_instance_mask'] = pts_instance_mask[points_mask]
if pts_semantic_mask is not None:
input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask]
return input_dict
def __repr__(self) -> str:
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
return repr_str
@@ -3,10 +3,11 @@ from typing import Dict, List, Optional
 import numpy as np
 import torch
 import torch.nn as nn
+from mmdet.models.losses.utils import weighted_loss
 from torch import Tensor

 from mmdet3d.models.task_modules import CenterPointBBoxCoder
-from mmdet3d.registry import TASK_UTILS
+from mmdet3d.registry import MODELS, TASK_UTILS
 from .ops.ingroup_inds.ingroup_inds_op import ingroup_inds

 get_inner_win_inds_cuda = ingroup_inds
@@ -266,7 +267,7 @@ class DSVTBBoxCoder(CenterPointBBoxCoder):
             thresh_mask = final_scores > self.score_threshold

         if self.post_center_range is not None:
-            self.post_center_range = torch.tensor(
+            self.post_center_range = torch.as_tensor(
                 self.post_center_range, device=heat.device)
             mask = (final_box_preds[..., :3] >=
                     self.post_center_range[:3]).all(2)
@@ -298,3 +299,142 @@ class DSVTBBoxCoder(CenterPointBBoxCoder):
                 'support post_center_range is not None for now!')

         return predictions_dicts
def center_to_corner2d(center, dim):
corners_norm = torch.tensor(
[[-0.5, -0.5], [-0.5, 0.5], [0.5, 0.5], [0.5, -0.5]],
device=dim.device).type_as(center) # (4, 2)
corners = dim.view([-1, 1, 2]) * corners_norm.view([1, 4, 2]) # (N, 4, 2)
corners = corners + center.view(-1, 1, 2)
return corners
@weighted_loss
def diou3d_loss(pred_boxes, gt_boxes, eps: float = 1e-7):
"""
modified from https://github.com/agent-sgs/PillarNet/blob/master/det3d/core/utils/center_utils.py # noqa
Args:
pred_boxes (N, 7):
gt_boxes (N, 7):
Returns:
Tensor: Distance-IoU Loss.
"""
assert pred_boxes.shape[0] == gt_boxes.shape[0]
qcorners = center_to_corner2d(pred_boxes[:, :2],
pred_boxes[:, 3:5]) # (N, 4, 2)
gcorners = center_to_corner2d(gt_boxes[:, :2], gt_boxes[:,
3:5]) # (N, 4, 2)
inter_max_xy = torch.minimum(qcorners[:, 2], gcorners[:, 2])
inter_min_xy = torch.maximum(qcorners[:, 0], gcorners[:, 0])
out_max_xy = torch.maximum(qcorners[:, 2], gcorners[:, 2])
out_min_xy = torch.minimum(qcorners[:, 0], gcorners[:, 0])
# calculate area
volume_pred_boxes = pred_boxes[:, 3] * pred_boxes[:, 4] * pred_boxes[:, 5]
volume_gt_boxes = gt_boxes[:, 3] * gt_boxes[:, 4] * gt_boxes[:, 5]
inter_h = torch.minimum(
pred_boxes[:, 2] + 0.5 * pred_boxes[:, 5],
gt_boxes[:, 2] + 0.5 * gt_boxes[:, 5]) - torch.maximum(
pred_boxes[:, 2] - 0.5 * pred_boxes[:, 5],
gt_boxes[:, 2] - 0.5 * gt_boxes[:, 5])
inter_h = torch.clamp(inter_h, min=0)
inter = torch.clamp((inter_max_xy - inter_min_xy), min=0)
volume_inter = inter[:, 0] * inter[:, 1] * inter_h
volume_union = volume_gt_boxes + volume_pred_boxes - volume_inter + eps
# boxes_iou3d_gpu(pred_boxes, gt_boxes)
inter_diag = torch.pow(gt_boxes[:, 0:3] - pred_boxes[:, 0:3], 2).sum(-1)
outer_h = torch.maximum(
gt_boxes[:, 2] + 0.5 * gt_boxes[:, 5],
pred_boxes[:, 2] + 0.5 * pred_boxes[:, 5]) - torch.minimum(
gt_boxes[:, 2] - 0.5 * gt_boxes[:, 5],
pred_boxes[:, 2] - 0.5 * pred_boxes[:, 5])
outer_h = torch.clamp(outer_h, min=0)
outer = torch.clamp((out_max_xy - out_min_xy), min=0)
outer_diag = outer[:, 0]**2 + outer[:, 1]**2 + outer_h**2 + eps
dious = volume_inter / volume_union - inter_diag / outer_diag
dious = torch.clamp(dious, min=-1.0, max=1.0)
loss = 1 - dious
return loss
@MODELS.register_module()
class DIoU3DLoss(nn.Module):
r"""3D bboxes Implementation of `Distance-IoU Loss: Faster and Better
Learning for Bounding Box Regression <https://arxiv.org/abs/1911.08287>`_.
Code is modified from https://github.com/Zzh-tju/DIoU.
Args:
eps (float): Epsilon to avoid log(0). Defaults to 1e-6.
reduction (str): Options are "none", "mean" and "sum".
Defaults to "mean".
loss_weight (float): Weight of loss. Defaults to 1.0.
"""
def __init__(self,
eps: float = 1e-6,
reduction: str = 'mean',
loss_weight: float = 1.0) -> None:
super().__init__()
self.eps = eps
self.reduction = reduction
self.loss_weight = loss_weight
def forward(self,
pred: Tensor,
target: Tensor,
weight: Optional[Tensor] = None,
avg_factor: Optional[int] = None,
reduction_override: Optional[str] = None,
**kwargs) -> Tensor:
"""Forward function.
Args:
            pred (Tensor): Predicted 3D boxes of format
                (x, y, z, l, w, h, yaw), shape (n, 7).
            target (Tensor): The learning target of the prediction,
                shape (n, 7).
weight (Optional[Tensor], optional): The weight of loss for each
prediction. Defaults to None.
avg_factor (Optional[int], optional): Average factor that is used
to average the loss. Defaults to None.
reduction_override (Optional[str], optional): The reduction method
used to override the original reduction method of the loss.
Defaults to None. Options are "none", "mean" and "sum".
Returns:
Tensor: Loss tensor.
"""
if weight is not None and not torch.any(weight > 0):
if pred.dim() == weight.dim() + 1:
weight = weight.unsqueeze(1)
return (pred * weight).sum() # 0
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
if weight is not None and weight.dim() > 1:
# TODO: remove this in the future
# reduce the weight of shape (n, 4) to (n,) to match the
# giou_loss of shape (n,)
assert weight.shape == pred.shape
weight = weight.mean(-1)
loss = self.loss_weight * diou3d_loss(
pred,
target,
weight,
eps=self.eps,
reduction=reduction,
avg_factor=avg_factor,
**kwargs)
return loss
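The new loss can be exercised in isolation, which makes the expected box layout explicit. The following is a minimal sketch, not part of the diff: it assumes the `DIoU3DLoss` class defined above is in scope and uses dummy boxes in (x, y, z, l, w, h, yaw) format.

```python
# Minimal sketch (assumption: DIoU3DLoss from the code above is in scope).
# Boxes are (x, y, z, l, w, h, yaw); only the first six values enter the DIoU term.
import torch

pred = torch.tensor([[0.0, 0.0, 0.0, 4.0, 2.0, 1.5, 0.0]])
target = torch.tensor([[0.5, 0.0, 0.0, 4.0, 2.0, 1.5, 0.0]])

loss_fn = DIoU3DLoss(reduction='mean', loss_weight=2.0)
print(loss_fn(pred, target))  # scalar tensor, larger for poorer overlap
```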
# NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection
> [NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection](https://arxiv.org/abs/2307.14620)
<!-- [ALGORITHM] -->
## Abstract
NeRF-Det is a novel method for indoor 3D detection with posed RGB images as input. Unlike existing indoor 3D detection methods that struggle to model scene geometry, NeRF-Det makes novel use of NeRF in an end-to-end manner to explicitly estimate 3D geometry, thereby improving 3D detection performance. Specifically, to avoid the significant extra latency associated with per-scene optimization of NeRF, NeRF-Det introduces sufficient geometry priors to enhance the generalizability of the NeRF-MLP. Furthermore, it subtly connects the detection and NeRF branches through a shared MLP, enabling an efficient adaptation of NeRF to detection and yielding geometry-aware volumetric representations for 3D detection. NeRF-Det outperforms state-of-the-art methods by 3.9 mAP and 3.1 mAP on the ScanNet and ARKitScenes benchmarks, respectively. The authors provide extensive analysis to shed light on how NeRF-Det works. As a result of the joint-training design, NeRF-Det is able to generalize well to unseen scenes for object detection, view synthesis, and depth estimation tasks without requiring per-scene optimization. Code will be available at https://github.com/facebookresearch/NeRF-Det
<div align=center>
<img src="https://chenfengxu714.github.io/nerfdet/static/images/method-cropped_1.png" width="800"/>
</div>
## Introduction
This directory contains the implementation of NeRF-Det (https://arxiv.org/abs/2307.14620). Our implementation is built on top of MMDetection3D. We have updated NeRF-Det to be compatible with the latest mmdet3d version; the codebase and config files have all been changed to adapt to the new mmdet3d version. All previously released pretrained models have been verified against the results listed below. However, newly trained models are yet to be uploaded.
<!-- Share any information you would like others to know. For example:
Author: @xxx.
This is an implementation of \[XXX\]. -->
## Dataset
The ScanNet dataset format in the latest version of mmdet3d only supports LiDAR tasks. For NeRF-Det, we need to create a new format of the ScanNet dataset.
Please follow the instructions in mmdet3d to prepare the raw ScanNet data. After that, use this command to generate the pkls used in nerfdet.
```bash
python projects/NeRF-Det/prepare_infos.py --root-path ./data/scannet --out-dir ./data/scannet
```
The new format of the pkl is organized as below:
- scannet_infos_train.pkl: The training data infos; the detailed info of each scan is as follows:
  - info\['instances'\]: A list of dicts containing all annotations; each dict holds the annotation information of a single instance. For the i-th instance:
    - info\['instances'\]\[i\]\['bbox_3d'\]: List of 6 numbers representing the axis-aligned 3D bounding box in the depth coordinate system, in (x, y, z, l, w, h) order.
    - info\['instances'\]\[i\]\['bbox_label_3d'\]: The label of each 3D bounding box.
  - info\['cam2img'\]: The intrinsic matrix. Every scene has one matrix.
  - info\['lidar2cam'\]: The extrinsic matrices. Every scene has 300 matrices.
  - info\['img_paths'\]: The paths of the 300 RGB images.
  - info\['axis_align_matrix'\]: The alignment matrix. Every scene has one matrix.

After preparing your ScanNet dataset pkls, please change the paths in the configs to fit your project.
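As a quick sanity check before training, the generated info file can be inspected directly. The snippet below is a minimal sketch: it assumes the pkl follows the standard mmdet3d info-file layout with per-scan dicts stored under a top-level `data_list` key, and uses the `scannet_infos_train_new.pkl` name referenced by the configs in this project; adjust the path and key lookup if `prepare_infos.py` writes a different structure.

```python
# Minimal sketch for inspecting the generated NeRF-Det info file.
# Assumes the standard mmdet3d layout with a top-level 'data_list' key.
import pickle

with open('data/scannet/scannet_infos_train_new.pkl', 'rb') as f:
    infos = pickle.load(f)

# Fall back to a plain list if the file is not wrapped in a dict.
scans = infos['data_list'] if isinstance(infos, dict) else infos
info = scans[0]

print('number of scans:', len(scans))
print('views in first scan:', len(info['img_paths']))  # expected: 300
print('intrinsics (cam2img):', info['cam2img'])
for i, inst in enumerate(info['instances'][:3]):
    # bbox_3d is axis-aligned (x, y, z, l, w, h) in the depth coordinate system
    print(i, inst['bbox_label_3d'], inst['bbox_3d'])
```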
## Train
In MMDet3D's root directory, run the following command to train the model:
```bash
python tools/train.py projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py ${WORK_DIR}
```
## Results and Models
### NeRF-Det
| Backbone | mAP@25 | mAP@50 | Log |
| :-------------------------------------------------------------: | :----: | :----: | :-------: |
| [NeRF-Det-R50](./configs/nerfdet_res50_2x_low_res.py) | 53.0 | 26.8 | [log](<>) |
| [NeRF-Det-R50\*](./configs/nerfdet_res50_2x_low_res_depth.py) | 52.2 | 28.5 | [log](<>) |
| [NeRF-Det-R101\*](./configs/nerfdet_res101_2x_low_res_depth.py) | 52.3 | 28.5 | [log](<>) |
(Here NeRF-Det-R50\* means this model uses depth information in the training step)
### Notes
- The values shown in the table represent the best mAP obtained during training.
- Since there is a lot of randomness in the behavior of the model, we conducted three experiments for each config and took the average. The mAP values shown in the table above are these averages.
- We also conducted the same experiments with the original code; the results are shown below.
| Backbone | mAP@25 | mAP@50 |
| :-------------: | :----: | :----: |
| NeRF-Det-R50 | 52.8 | 26.8 |
| NeRF-Det-R50\* | 52.4 | 27.5 |
| NeRF-Det-R101\* | 52.8 | 28.6 |
- Attention: Because of the randomness in the construction of the ScanNet dataset itself and in the behavior of the model, training results can fluctuate considerably. In our experience, results fluctuate by roughly plus or minus 1.5 points.
## Evaluation using pretrained models
1. Download the pretrained checkpoints through the links in the table above.
2. Testing
To test, use:
```bash
python tools/test.py projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py ${CHECKPOINT_PATH}
```
## Citation
<!-- You may remove this section if not applicable. -->
```latex
@inproceedings{
xu2023nerfdet,
title={NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection},
author={Xu, Chenfeng and Wu, Bichen and Hou, Ji and Tsai, Sam and Li, Ruilong and Wang, Jialiang and Zhan, Wei and He, Zijian and Vajda, Peter and Keutzer, Kurt and Tomizuka, Masayoshi},
booktitle={ICCV},
year={2023},
}
@inproceedings{
park2023time,
title={Time Will Tell: New Outlooks and A Baseline for Temporal Multi-View 3D Object Detection},
author={Jinhyung Park and Chenfeng Xu and Shijia Yang and Kurt Keutzer and Kris M. Kitani and Masayoshi Tomizuka and Wei Zhan},
booktitle={The Eleventh International Conference on Learning Representations },
year={2023},
url={https://openreview.net/forum?id=H3HcEJA2Um}
}
```
_base_ = ['../../../configs/_base_/default_runtime.py']
custom_imports = dict(imports=['projects.NeRF-Det.nerfdet'])
prior_generator = dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-3.2, -3.2, -1.28, 3.2, 3.2, 1.28]],
rotations=[.0])
model = dict(
type='NerfDet',
data_preprocessor=dict(
type='NeRFDetDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=10),
backbone=dict(
type='mmdet.ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
style='pytorch'),
neck=dict(
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=4),
neck_3d=dict(
type='IndoorImVoxelNeck',
in_channels=256,
out_channels=128,
n_blocks=[1, 1, 1]),
bbox_head=dict(
type='NerfDetHead',
bbox_loss=dict(type='AxisAlignedIoULoss', loss_weight=1.0),
n_classes=18,
n_levels=3,
n_channels=128,
n_reg_outs=6,
pts_assign_threshold=27,
pts_center_threshold=18,
prior_generator=prior_generator),
prior_generator=prior_generator,
voxel_size=[.16, .16, .2],
n_voxels=[40, 40, 16],
aabb=([-2.7, -2.7, -0.78], [3.7, 3.7, 1.78]),
near_far_range=[0.2, 8.0],
N_samples=64,
N_rand=2048,
nerf_mode='image',
depth_supervise=True,
use_nerf_mask=True,
nerf_sample_view=20,
squeeze_scale=4,
nerf_density=True,
train_cfg=dict(),
test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01))
dataset_type = 'MultiViewScanNetDataset'
data_root = 'data/scannet/'
class_names = [
'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf',
'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain',
'toilet', 'sink', 'bathtub', 'garbagebin'
]
metainfo = dict(CLASSES=class_names)
file_client_args = dict(backend='disk')
input_modality = dict(
use_camera=True,
use_depth=True,
use_lidar=False,
use_neuralrecon_depth=False,
use_ray=True)
backend_args = None
train_collect_keys = [
'img', 'gt_bboxes_3d', 'gt_labels_3d', 'depth', 'lightpos', 'nerf_sizes',
'raydirs', 'gt_images', 'gt_depths', 'denorm_images'
]
test_collect_keys = [
'img',
'depth',
'lightpos',
'nerf_sizes',
'raydirs',
'gt_images',
'gt_depths',
'denorm_images',
]
train_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=48,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=10),
dict(type='RandomShiftOrigin', std=(.7, .7, .0)),
dict(type='PackNeRFDetInputs', keys=train_collect_keys)
]
test_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=101,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=1),
dict(type='PackNeRFDetInputs', keys=test_collect_keys)
]
train_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=6,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_train_new.pkl',
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo)))
val_dataloader = dict(
batch_size=1,
num_workers=5,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_val_new.pkl',
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo))
test_dataloader = val_dataloader
val_evaluator = dict(type='IndoorMetric')
test_evaluator = val_evaluator
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
test_cfg = dict()
val_cfg = dict()
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
# hooks
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=12))
# runtime
find_unused_parameters = True # only 1 of 4 FPN outputs is used
_base_ = ['./nerfdet_res50_2x_low_res_depth.py']
model = dict(depth_supervise=False)
dataset_type = 'MultiViewScanNetDataset'
data_root = 'data/scannet/'
class_names = [
'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf',
'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain',
'toilet', 'sink', 'bathtub', 'garbagebin'
]
metainfo = dict(CLASSES=class_names)
file_client_args = dict(backend='disk')
input_modality = dict(use_depth=False)
backend_args = None
train_collect_keys = [
'img', 'gt_bboxes_3d', 'gt_labels_3d', 'lightpos', 'nerf_sizes', 'raydirs',
'gt_images', 'gt_depths', 'denorm_images'
]
test_collect_keys = [
'img',
'lightpos',
'nerf_sizes',
'raydirs',
'gt_images',
'gt_depths',
'denorm_images',
]
train_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=50,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=10),
dict(type='RandomShiftOrigin', std=(.7, .7, .0)),
dict(type='PackNeRFDetInputs', keys=train_collect_keys)
]
test_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=101,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=1),
dict(type='PackNeRFDetInputs', keys=test_collect_keys)
]
train_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=6,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_train_new.pkl',
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo)))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_val_new.pkl',
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo))
test_dataloader = val_dataloader
_base_ = ['../../../configs/_base_/default_runtime.py']
custom_imports = dict(imports=['projects.NeRF-Det.nerfdet'])
prior_generator = dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-3.2, -3.2, -1.28, 3.2, 3.2, 1.28]],
rotations=[.0])
model = dict(
type='NerfDet',
data_preprocessor=dict(
type='NeRFDetDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=10),
backbone=dict(
type='mmdet.ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
style='pytorch'),
neck=dict(
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=4),
neck_3d=dict(
type='IndoorImVoxelNeck',
in_channels=256,
out_channels=128,
n_blocks=[1, 1, 1]),
bbox_head=dict(
type='NerfDetHead',
bbox_loss=dict(type='AxisAlignedIoULoss', loss_weight=1.0),
n_classes=18,
n_levels=3,
n_channels=128,
n_reg_outs=6,
pts_assign_threshold=27,
pts_center_threshold=18,
prior_generator=prior_generator),
prior_generator=prior_generator,
voxel_size=[.16, .16, .2],
n_voxels=[40, 40, 16],
aabb=([-2.7, -2.7, -0.78], [3.7, 3.7, 1.78]),
near_far_range=[0.2, 8.0],
N_samples=64,
N_rand=2048,
nerf_mode='image',
depth_supervise=True,
use_nerf_mask=True,
nerf_sample_view=20,
squeeze_scale=4,
nerf_density=True,
train_cfg=dict(),
test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01))
dataset_type = 'MultiViewScanNetDataset'
data_root = 'data/scannet/'
class_names = [
'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf',
'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain',
'toilet', 'sink', 'bathtub', 'garbagebin'
]
metainfo = dict(CLASSES=class_names)
file_client_args = dict(backend='disk')
input_modality = dict(
use_camera=True,
use_depth=True,
use_lidar=False,
use_neuralrecon_depth=False,
use_ray=True)
backend_args = None
train_collect_keys = [
'img', 'gt_bboxes_3d', 'gt_labels_3d', 'depth', 'lightpos', 'nerf_sizes',
'raydirs', 'gt_images', 'gt_depths', 'denorm_images'
]
test_collect_keys = [
'img',
'depth',
'lightpos',
'nerf_sizes',
'raydirs',
'gt_images',
'gt_depths',
'denorm_images',
]
train_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=50,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=10),
dict(type='RandomShiftOrigin', std=(.7, .7, .0)),
dict(type='PackNeRFDetInputs', keys=train_collect_keys)
]
test_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=101,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=1),
dict(type='PackNeRFDetInputs', keys=test_collect_keys)
]
train_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=6,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_train_new.pkl',
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo)))
val_dataloader = dict(
batch_size=1,
num_workers=5,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_val_new.pkl',
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo))
test_dataloader = val_dataloader
val_evaluator = dict(type='IndoorMetric')
test_evaluator = val_evaluator
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
test_cfg = dict()
val_cfg = dict()
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
# hooks
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=12))
# runtime
find_unused_parameters = True # only 1 of 4 FPN outputs is used
from .data_preprocessor import NeRFDetDataPreprocessor
from .formating import PackNeRFDetInputs
from .multiview_pipeline import MultiViewPipeline, RandomShiftOrigin
from .nerfdet import NerfDet
from .nerfdet_head import NerfDetHead
from .scannet_multiview_dataset import MultiViewScanNetDataset
__all__ = [
'MultiViewScanNetDataset', 'MultiViewPipeline', 'RandomShiftOrigin',
'PackNeRFDetInputs', 'NeRFDetDataPreprocessor', 'NerfDetHead', 'NerfDet'
]