Unverified Commit fe25f7a5 authored by Wenwei Zhang, committed by GitHub

Merge pull request #2867 from open-mmlab/dev-1.x

Bump version to 1.4.0
parents 5c0613be 0ef13b83
......@@ -275,12 +275,13 @@ class BaseInstance3DBoxes:
Tensor: A binary vector indicating whether each point is inside the
reference range.
"""
in_range_flags = ((self.tensor[:, 0] > box_range[0])
& (self.tensor[:, 1] > box_range[1])
& (self.tensor[:, 2] > box_range[2])
& (self.tensor[:, 0] < box_range[3])
& (self.tensor[:, 1] < box_range[4])
& (self.tensor[:, 2] < box_range[5]))
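# updated to use the boxes' gravity centers rather than the raw box origins
# stored in self.tensor (typically the bottom centers), so a box counts as
# in-range when its 3D center lies inside the given range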
gravity_center = self.gravity_center
in_range_flags = ((gravity_center[:, 0] > box_range[0])
& (gravity_center[:, 1] > box_range[1])
& (gravity_center[:, 2] > box_range[2])
& (gravity_center[:, 0] < box_range[3])
& (gravity_center[:, 1] < box_range[4])
& (gravity_center[:, 2] < box_range[5]))
return in_range_flags
@abstractmethod
......
# Copyright (c) Open-MMLab. All rights reserved.
__version__ = '1.3.0'
__version__ = '1.4.0'
short_version = __version__
......
......@@ -179,7 +179,10 @@ test_pipeline = [
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range)
]),
dict(type='Pack3DDetInputs', keys=['points'])
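# 'context_name' and 'timestamp' are packed as meta information because the
# Waymo evaluation matches each prediction to its frame in the ground-truth
# bin file by segment context name and frame timestamp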
dict(
type='Pack3DDetInputs',
keys=['points'],
meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
]
dataset_type = 'WaymoDataset'
......@@ -223,13 +226,7 @@ val_dataloader = dict(
test_dataloader = val_dataloader
val_evaluator = dict(
type='WaymoMetric',
ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
waymo_bin_file='./data/waymo/waymo_format/gt.bin',
data_root='./data/waymo/waymo_format',
backend_args=backend_args,
convert_kitti_format=False,
idx2metainfo='./data/waymo/waymo_format/idx2metainfo.pkl')
type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin')
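# with this change the evaluator only needs the Waymo-format ground-truth bin
# file; the KITTI-format annotation file, data_root and idx2metainfo mapping
# listed above are no longer required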
test_evaluator = val_evaluator
vis_backends = [dict(type='LocalVisBackend')]
......
......@@ -57,17 +57,25 @@ python tools/test.py projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-
### Training commands
The support of training DSVT is on the way.
In MMDetection3D's root directory, run the following command to train the model:
```bash
tools/dist_train.sh projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py 8 --sync_bn torch
```
## Results and models
### Waymo
| Middle Encoder | Backbone | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download |
| :------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | :----------: | :------: | :------------: | :----: | :-----: | :----: | :---------: | :------: |
| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × | | | 75.2 | 72.2 | 68.9 | 66.1 | |
| Middle Encoder | Backbone | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download |
| :------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | :----------: | :----: | :-----: | :----: | :---------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × | 75.5 | 72.4 | 69.2 | 66.3 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/dsvt/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class_20230917_102130.log) |
**Note**:
- `ResSECOND` denotes that the base block in SECOND has residual layers.
- Regrettably, we are unable to provide the pre-trained model weights due to the [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above.
## Citation
......
......@@ -88,25 +88,28 @@ model = dict(
loss_cls=dict(
type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0),
loss_bbox=dict(type='mmdet.L1Loss', reduction='mean', loss_weight=2.0),
loss_iou=dict(type='mmdet.L1Loss', reduction='sum', loss_weight=1.0),
loss_reg_iou=dict(
type='mmdet3d.DIoU3DLoss', reduction='mean', loss_weight=2.0),
norm_bbox=True),
# model training and testing settings
train_cfg=dict(
pts=dict(
grid_size=grid_size,
voxel_size=voxel_size,
out_size_factor=4,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])),
grid_size=grid_size,
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
out_size_factor=1,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]),
test_cfg=dict(
max_per_img=500,
max_pool_nms=False,
min_radius=[4, 12, 10, 1, 0.85, 0.175],
iou_rectifier=[[0.68, 0.71, 0.65]],
pc_range=[-80, -80],
out_size_factor=4,
out_size_factor=1,
voxel_size=voxel_size[:2],
nms_type='rotate',
multi_class_nms=True,
......@@ -128,6 +131,8 @@ db_sampler = dict(
coord_type='LIDAR',
load_dim=6,
use_dim=[0, 1, 2, 3, 4],
norm_intensity=True,
norm_elongation=True,
backend_args=backend_args),
backend_args=backend_args)
......@@ -138,25 +143,22 @@ train_pipeline = [
load_dim=6,
use_dim=5,
norm_intensity=True,
norm_elongation=True,
backend_args=backend_args),
# Add this if using `MultiFrameDeformableDecoderRPN`
# dict(
# type='LoadPointsFromMultiSweeps',
# sweeps_num=9,
# load_dim=6,
# use_dim=[0, 1, 2, 3, 4],
# pad_empty_sweeps=True,
# remove_close=True),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05],
translation_std=[0.5, 0.5, 0]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
translation_std=[0.5, 0.5, 0.5]),
dict(type='PointsRangeFilter3D', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter3D', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(
type='Pack3DDetInputs',
......@@ -172,25 +174,34 @@ test_pipeline = [
norm_intensity=True,
norm_elongation=True,
backend_args=backend_args),
dict(type='PointsRangeFilter3D', point_cloud_range=point_cloud_range),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range)
]),
dict(type='Pack3DDetInputs', keys=['points'])
type='Pack3DDetInputs',
keys=['points'],
meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
]
dataset_type = 'WaymoDataset'
train_dataloader = dict(
batch_size=1,
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_train.pkl',
data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR',
# load one frame every five frames
load_interval=5,
backend_args=backend_args))
val_dataloader = dict(
batch_size=4,
num_workers=4,
......@@ -212,18 +223,59 @@ test_dataloader = val_dataloader
val_evaluator = dict(
type='WaymoMetric',
ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
waymo_bin_file='./data/waymo/waymo_format/gt.bin',
data_root='./data/waymo/waymo_format',
backend_args=backend_args,
convert_kitti_format=False,
idx2metainfo='./data/waymo/waymo_format/idx2metainfo.pkl')
result_prefix='./dsvt_pred')
test_evaluator = val_evaluator
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
# schedules
lr = 1e-5
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=lr, weight_decay=0.05, betas=(0.9, 0.99)),
clip_grad=dict(max_norm=10, norm_type=2))
param_scheduler = [
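# cyclic schedule: the learning rate rises from 1e-5 to 1e-3 (lr * 100) over
# the first 1.2 epochs (warm-up) and then decays to 1e-9 (lr * 1e-4) over the
# remaining 10.8 epochs; momentum follows the opposite pattern, dropping to
# 0.85 during warm-up and recovering to 0.95 afterwards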
dict(
type='CosineAnnealingLR',
T_max=1.2,
eta_min=lr * 100,
begin=0,
end=1.2,
by_epoch=True,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=10.8,
eta_min=lr * 1e-4,
begin=1.2,
end=12,
by_epoch=True,
convert_to_iter_based=True),
# momentum scheduler
dict(
type='CosineAnnealingMomentum',
T_max=1.2,
eta_min=0.85,
begin=0,
end=1.2,
by_epoch=True,
convert_to_iter_based=True),
dict(
type='CosineAnnealingMomentum',
T_max=10.8,
eta_min=0.95,
begin=1.2,
end=12,
by_epoch=True,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=12, val_interval=1)
# runtime settings
val_cfg = dict()
test_cfg = dict()
......@@ -236,4 +288,12 @@ test_cfg = dict()
default_hooks = dict(
logger=dict(type='LoggerHook', interval=50),
checkpoint=dict(type='CheckpointHook', interval=5))
checkpoint=dict(type='CheckpointHook', interval=1))
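# turn off the listed augmentations after epoch 11, so the final epoch is
# trained without GT sampling, random flipping or rotation/scaling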
custom_hooks = [
dict(
type='DisableAugHook',
disable_after_epoch=11,
disable_aug_list=[
'GlobalRotScaleTrans', 'RandomFlip3D', 'ObjectSample'
])
]
from .disable_aug_hook import DisableAugHook
from .dsvt import DSVT
from .dsvt_head import DSVTCenterHead
from .dsvt_transformer import DSVTMiddleEncoder
from .dynamic_pillar_vfe import DynamicPillarVFE3D
from .map2bev import PointPillarsScatter3D
from .res_second import ResSECOND
from .transforms_3d import ObjectRangeFilter3D, PointsRangeFilter3D
from .utils import DSVTBBoxCoder
__all__ = [
'DSVTCenterHead', 'DSVT', 'DSVTMiddleEncoder', 'DynamicPillarVFE3D',
'PointPillarsScatter3D', 'ResSECOND', 'DSVTBBoxCoder'
'PointPillarsScatter3D', 'ResSECOND', 'DSVTBBoxCoder',
'ObjectRangeFilter3D', 'PointsRangeFilter3D', 'DisableAugHook'
]
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List
from mmengine.dataset import BaseDataset
from mmengine.hooks import Hook
from mmengine.model import is_model_wrapper
from mmengine.runner import Runner
from mmdet3d.registry import HOOKS
@HOOKS.register_module()
class DisableAugHook(Hook):
"""The hook of disabling augmentations during training.
Args:
disable_after_epoch (int): The number of epochs after which
the data augmentation will be disabled during training.
Defaults to 15.
disable_aug_list (list): The list of data augmentations to be
disabled during training. Defaults to [].
"""
def __init__(self,
disable_after_epoch: int = 15,
disable_aug_list: List = []):
self.disable_after_epoch = disable_after_epoch
self.disable_aug_list = disable_aug_list
self._restart_dataloader = False
def before_train_epoch(self, runner: Runner):
"""Close augmentation.
Args:
runner (Runner): The runner.
"""
epoch = runner.epoch
train_loader = runner.train_dataloader
model = runner.model
# TODO: refactor after mmengine using model wrapper
if is_model_wrapper(model):
model = model.module
if epoch == self.disable_after_epoch:
dataset = runner.train_dataloader.dataset
# handle dataset wrapper
if not isinstance(dataset, BaseDataset):
dataset = dataset.dataset
new_transforms = []
for transform in dataset.pipeline.transforms: # noqa: E501
if transform.__class__.__name__ not in self.disable_aug_list:
new_transforms.append(transform)
else:
runner.logger.info(
f'Disable {transform.__class__.__name__}')
dataset.pipeline.transforms = new_transforms
# The dataset pipeline cannot be updated when persistent_workers
# is True, so we need to force the dataloader's multi-process
# restart. This is a very hacky approach.
if hasattr(train_loader, 'persistent_workers'
) and train_loader.persistent_workers is True:
train_loader._DataLoader__initialized = False
train_loader._iterator = None
self._restart_dataloader = True
else:
# Once the restart is complete, we need to restore
# the initialization flag.
if self._restart_dataloader:
train_loader._DataLoader__initialized = True
......@@ -103,7 +103,11 @@ class DSVT(Base3DDetector):
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
pass
pts_feats = self.extract_feat(batch_inputs_dict)
losses = dict()
loss = self.bbox_head.loss(pts_feats, batch_data_samples)
losses.update(loss)
return losses
def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
batch_data_samples: List[Det3DDataSample],
......
import math
from typing import Dict, List, Tuple
import torch
import torch.nn as nn
from mmcv.ops import boxes_iou3d
from mmdet.models.utils import multi_apply
from mmengine.model import kaiming_init
from mmengine.structures import InstanceData
from torch import Tensor
from torch.nn.init import constant_
from mmdet3d.models import CenterHead
from mmdet3d.models.layers import circle_nms, nms_bev
from mmdet3d.models.utils import (clip_sigmoid, draw_heatmap_gaussian,
gaussian_radius)
from mmdet3d.registry import MODELS
from mmdet3d.structures import Det3DDataSample, xywhr2xyxyr
......@@ -18,8 +25,33 @@ class DSVTCenterHead(CenterHead):
This head adds an IoU prediction branch on top of the original CenterHead.
"""
def __init__(self, *args, **kwargs):
def __init__(self,
loss_iou=dict(
type='mmdet.L1Loss', reduction='mean', loss_weight=1),
loss_reg_iou=None,
*args,
**kwargs):
super(DSVTCenterHead, self).__init__(*args, **kwargs)
self.loss_iou = MODELS.build(loss_iou)
self.loss_iou_reg = MODELS.build(
loss_reg_iou) if loss_reg_iou is not None else None
def init_weights(self):
kaiming_init(
self.shared_conv.conv,
a=math.sqrt(5),
mode='fan_in',
nonlinearity='leaky_relu',
distribution='uniform')
for head in self.task_heads[0].heads:
if head == 'heatmap':
constant_(self.task_heads[0].__getattr__(head)[-1].bias,
self.task_heads[0].init_bias)
else:
for m in self.task_heads[0].__getattr__(head).modules():
if isinstance(m, nn.Conv2d):
kaiming_init(
m, mode='fan_in', nonlinearity='leaky_relu')
def forward_single(self, x: Tensor) -> dict:
"""Forward function for CenterPoint.
......@@ -66,7 +98,298 @@ class DSVTCenterHead(CenterHead):
Returns:
dict: Losses of each branch.
"""
pass
outs = self(pts_feats)
batch_gt_instance_3d = []
for data_sample in batch_data_samples:
batch_gt_instance_3d.append(data_sample.gt_instances_3d)
losses = self.loss_by_feat(outs, batch_gt_instance_3d)
return losses
def _decode_all_preds(self,
pred_dict,
point_cloud_range=None,
voxel_size=None):
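# decode the dense per-location predictions (reg / height / dim / rot and,
# if present, vel) back to boxes in LiDAR coordinates using the voxel size
# and point cloud range; returns a tensor of shape (B, H, W, 7 or 9)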
batch_size, _, H, W = pred_dict['reg'].shape
batch_center = pred_dict['reg'].permute(0, 2, 3, 1).contiguous().view(
batch_size, H * W, 2) # (B, H, W, 2)
batch_center_z = pred_dict['height'].permute(
0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1)
batch_dim = pred_dict['dim'].exp().permute(
0, 2, 3, 1).contiguous().view(batch_size, H * W, 3) # (B, H, W, 3)
batch_rot_cos = pred_dict['rot'][:, 0].unsqueeze(dim=1).permute(
0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1)
batch_rot_sin = pred_dict['rot'][:, 1].unsqueeze(dim=1).permute(
0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1)
batch_vel = pred_dict['vel'].permute(0, 2, 3, 1).contiguous().view(
batch_size, H * W, 2) if 'vel' in pred_dict.keys() else None
angle = torch.atan2(batch_rot_sin, batch_rot_cos) # (B, H*W, 1)
ys, xs = torch.meshgrid([
torch.arange(
0, H, device=batch_center.device, dtype=batch_center.dtype),
torch.arange(
0, W, device=batch_center.device, dtype=batch_center.dtype)
])
ys = ys.view(1, H, W).repeat(batch_size, 1, 1)
xs = xs.view(1, H, W).repeat(batch_size, 1, 1)
xs = xs.view(batch_size, -1, 1) + batch_center[:, :, 0:1]
ys = ys.view(batch_size, -1, 1) + batch_center[:, :, 1:2]
xs = xs * voxel_size[0] + point_cloud_range[0]
ys = ys * voxel_size[1] + point_cloud_range[1]
box_part_list = [xs, ys, batch_center_z, batch_dim, angle]
if batch_vel is not None:
box_part_list.append(batch_vel)
box_preds = torch.cat((box_part_list),
dim=-1).view(batch_size, H, W, -1)
return box_preds
def _transpose_and_gather_feat(self, feat, ind):
feat = feat.permute(0, 2, 3, 1).contiguous()
feat = feat.view(feat.size(0), -1, feat.size(3))
feat = self._gather_feat(feat, ind)
return feat
def calc_iou_loss(self, iou_preds, batch_box_preds, mask, ind, gt_boxes):
"""
Args:
iou_preds: (batch x 1 x h x w)
batch_box_preds: (batch x (7 or 9) x h x w)
mask: (batch x max_objects)
ind: (batch x max_objects)
gt_boxes: List of batch groundtruth boxes.
Returns:
Tensor: IoU Loss.
"""
if mask.sum() == 0:
return iou_preds.new_zeros((1))
mask = mask.bool()
selected_iou_preds = self._transpose_and_gather_feat(iou_preds,
ind)[mask]
selected_box_preds = self._transpose_and_gather_feat(
batch_box_preds, ind)[mask]
gt_boxes = torch.cat(gt_boxes)
assert gt_boxes.size(0) == selected_box_preds.size(0)
iou_target = boxes_iou3d(selected_box_preds[:, 0:7], gt_boxes[:, 0:7])
iou_target = torch.diag(iou_target).view(-1)
iou_target = iou_target * 2 - 1 # [0, 1] ==> [-1, 1]
loss = self.loss_iou(selected_iou_preds.view(-1), iou_target)
loss = loss / torch.clamp(mask.sum(), min=1e-4)
return loss
def calc_iou_reg_loss(self, batch_box_preds, mask, ind, gt_boxes):
if mask.sum() == 0:
return batch_box_preds.new_zeros((1))
mask = mask.bool()
selected_box_preds = self._transpose_and_gather_feat(
batch_box_preds, ind)[mask]
gt_boxes = torch.cat(gt_boxes)
assert gt_boxes.size(0) == selected_box_preds.size(0)
loss = self.loss_iou_reg(selected_box_preds[:, 0:7], gt_boxes[:, 0:7])
return loss
def get_targets(
self,
batch_gt_instances_3d: List[InstanceData],
) -> Tuple[List[Tensor]]:
"""Generate targets.
How each output is transformed:
Each nested list is transposed so that all same-index elements in
each sub-list (1, ..., N) become the new sub-lists.
[ [a0, a1, a2, ... ], [b0, b1, b2, ... ], ... ]
==> [ [a0, b0, ... ], [a1, b1, ... ], [a2, b2, ... ] ]
The new transposed nested list is converted into a list of N
tensors generated by concatenating tensors in the new sub-lists.
[ tensor0, tensor1, tensor2, ... ]
Args:
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instances. It usually includes ``bboxes_3d`` and\
``labels_3d`` attributes.
Returns:
tuple[list[torch.Tensor]]: Tuple of target including
the following results in order.
- list[torch.Tensor]: Heatmap scores.
- list[torch.Tensor]: Ground truth boxes.
- list[torch.Tensor]: Indexes indicating the
position of the valid boxes.
- list[torch.Tensor]: Masks indicating which
boxes are valid.
"""
heatmaps, anno_boxes, inds, masks, task_gt_bboxes = multi_apply(
self.get_targets_single, batch_gt_instances_3d)
# Transpose heatmaps
heatmaps = list(map(list, zip(*heatmaps)))
heatmaps = [torch.stack(hms_) for hms_ in heatmaps]
# Transpose anno_boxes
anno_boxes = list(map(list, zip(*anno_boxes)))
anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in anno_boxes]
# Transpose inds
inds = list(map(list, zip(*inds)))
inds = [torch.stack(inds_) for inds_ in inds]
# Transpose masks
masks = list(map(list, zip(*masks)))
masks = [torch.stack(masks_) for masks_ in masks]
# Transpose task_gt_bboxes
task_gt_bboxes = list(map(list, zip(*task_gt_bboxes)))
return heatmaps, anno_boxes, inds, masks, task_gt_bboxes
def get_targets_single(self,
gt_instances_3d: InstanceData) -> Tuple[Tensor]:
"""Generate training targets for a single sample.
Args:
gt_instances_3d (:obj:`InstanceData`): Gt_instances_3d of
single data sample. It usually includes
``bboxes_3d`` and ``labels_3d`` attributes.
Returns:
tuple[list[torch.Tensor]]: Tuple of target including
the following results in order.
- list[torch.Tensor]: Heatmap scores.
- list[torch.Tensor]: Ground truth boxes.
- list[torch.Tensor]: Indexes indicating the position
of the valid boxes.
- list[torch.Tensor]: Masks indicating which boxes
are valid.
"""
gt_labels_3d = gt_instances_3d.labels_3d
gt_bboxes_3d = gt_instances_3d.bboxes_3d
device = gt_labels_3d.device
gt_bboxes_3d = torch.cat(
(gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]),
dim=1).to(device)
max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg']
grid_size = torch.tensor(self.train_cfg['grid_size']).to(device)
pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
voxel_size = torch.tensor(self.train_cfg['voxel_size'])
feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor']
# reorganize the gt_dict by tasks
task_masks = []
flag = 0
for class_name in self.class_names:
task_masks.append([
torch.where(gt_labels_3d == class_name.index(i) + flag)
for i in class_name
])
flag += len(class_name)
task_boxes = []
task_classes = []
flag2 = 0
for idx, mask in enumerate(task_masks):
task_box = []
task_class = []
for m in mask:
task_box.append(gt_bboxes_3d[m])
# 0 is background for each task, so we need to add 1 here.
task_class.append(gt_labels_3d[m] + 1 - flag2)
task_boxes.append(torch.cat(task_box, axis=0).to(device))
task_classes.append(torch.cat(task_class).long().to(device))
flag2 += len(mask)
draw_gaussian = draw_heatmap_gaussian
heatmaps, anno_boxes, inds, masks = [], [], [], []
for idx, task_head in enumerate(self.task_heads):
heatmap = gt_bboxes_3d.new_zeros(
(len(self.class_names[idx]), feature_map_size[1],
feature_map_size[0]))
anno_box = gt_bboxes_3d.new_zeros((max_objs, 8),
dtype=torch.float32)
ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64)
mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8)
num_objs = min(task_boxes[idx].shape[0], max_objs)
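# for each GT box: compute its size in feature-map cells, draw a Gaussian
# centered on the box center into the class heatmap, and record the
# flattened center index together with the regression targets
# (center offset, z, log-dims, cos/sin of yaw)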
for k in range(num_objs):
cls_id = task_classes[idx][k] - 1
length = task_boxes[idx][k][3]
width = task_boxes[idx][k][4]
length = length / voxel_size[0] / self.train_cfg[
'out_size_factor']
width = width / voxel_size[1] / self.train_cfg[
'out_size_factor']
if width > 0 and length > 0:
radius = gaussian_radius(
(width, length),
min_overlap=self.train_cfg['gaussian_overlap'])
radius = max(self.train_cfg['min_radius'], int(radius))
# be really careful for the coordinate system of
# your box annotation.
x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][
1], task_boxes[idx][k][2]
coor_x = (
x - pc_range[0]
) / voxel_size[0] / self.train_cfg['out_size_factor']
coor_y = (
y - pc_range[1]
) / voxel_size[1] / self.train_cfg['out_size_factor']
center = torch.tensor([coor_x, coor_y],
dtype=torch.float32,
device=device)
center_int = center.to(torch.int32)
# throw out not in range objects to avoid out of array
# area when creating the heatmap
if not (0 <= center_int[0] < feature_map_size[0]
and 0 <= center_int[1] < feature_map_size[1]):
continue
draw_gaussian(heatmap[cls_id], center_int, radius)
new_idx = k
x, y = center_int[0], center_int[1]
assert (y * feature_map_size[0] + x <
feature_map_size[0] * feature_map_size[1])
ind[new_idx] = y * feature_map_size[0] + x
mask[new_idx] = 1
# TODO: support other outdoor dataset
rot = task_boxes[idx][k][6]
box_dim = task_boxes[idx][k][3:6]
if self.norm_bbox:
box_dim = box_dim.log()
anno_box[new_idx] = torch.cat([
center - torch.tensor([x, y], device=device),
z.unsqueeze(0), box_dim,
torch.cos(rot).unsqueeze(0),
torch.sin(rot).unsqueeze(0)
])
heatmaps.append(heatmap)
anno_boxes.append(anno_box)
masks.append(mask)
inds.append(ind)
return heatmaps, anno_boxes, inds, masks, task_boxes
def loss_by_feat(self, preds_dicts: Tuple[List[dict]],
batch_gt_instances_3d: List[InstanceData], *args,
......@@ -79,13 +402,72 @@ class DSVTCenterHead(CenterHead):
tasks head, and the internal list indicate different
FPN level.
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instances. It usually includes ``bboxes_3d`` and\
gt_instances_3d. It usually includes ``bboxes_3d`` and
``labels_3d`` attributes.
Returns:
dict[str,torch.Tensor]: Loss of heatmap and bbox of each task.
"""
pass
heatmaps, anno_boxes, inds, masks, task_gt_bboxes = self.get_targets(
batch_gt_instances_3d)
loss_dict = dict()
for task_id, preds_dict in enumerate(preds_dicts):
# heatmap focal loss
preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap'])
num_pos = heatmaps[task_id].eq(1).float().sum().item()
loss_heatmap = self.loss_cls(
preds_dict[0]['heatmap'],
heatmaps[task_id],
avg_factor=max(num_pos, 1))
target_box = anno_boxes[task_id]
# reconstruct the anno_box from multiple reg heads
preds_dict[0]['anno_box'] = torch.cat(
(preds_dict[0]['reg'], preds_dict[0]['height'],
preds_dict[0]['dim'], preds_dict[0]['rot']),
dim=1)
# Regression loss for dimension, offset, height, rotation
ind = inds[task_id]
num = masks[task_id].float().sum()
pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous()
pred = pred.view(pred.size(0), -1, pred.size(3))
pred = self._gather_feat(pred, ind)
mask = masks[task_id].unsqueeze(2).expand_as(target_box).float()
isnotnan = (~torch.isnan(target_box)).float()
mask *= isnotnan
code_weights = self.train_cfg.get('code_weights', None)
bbox_weights = mask * mask.new_tensor(code_weights)
loss_bbox = self.loss_bbox(
pred, target_box, bbox_weights, avg_factor=(num + 1e-4))
loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap
loss_dict[f'task{task_id}.loss_bbox'] = loss_bbox
if 'iou' in preds_dict[0]:
batch_box_preds = self._decode_all_preds(
pred_dict=preds_dict[0],
point_cloud_range=self.train_cfg['point_cloud_range'],
voxel_size=self.train_cfg['voxel_size']
) # (B, H, W, 7 or 9)
batch_box_preds_for_iou = batch_box_preds.permute(
0, 3, 1, 2) # (B, 7 or 9, H, W)
loss_dict[f'task{task_id}.loss_iou'] = self.calc_iou_loss(
iou_preds=preds_dict[0]['iou'],
batch_box_preds=batch_box_preds_for_iou.clone().detach(),
mask=masks[task_id],
ind=ind,
gt_boxes=task_gt_bboxes[task_id])
if self.loss_iou_reg is not None:
loss_dict[f'task{task_id}.loss_reg_iou'] = \
self.calc_iou_reg_loss(
batch_box_preds=batch_box_preds_for_iou,
mask=masks[task_id],
ind=ind,
gt_boxes=task_gt_bboxes[task_id])
return loss_dict
def predict(self,
pts_feats: Tuple[torch.Tensor],
......@@ -158,6 +540,7 @@ class DSVTCenterHead(CenterHead):
else:
batch_dim = preds_dict[0]['dim']
# It's different from CenterHead
batch_rotc = preds_dict[0]['rot'][:, 0].unsqueeze(1)
batch_rots = preds_dict[0]['rot'][:, 1].unsqueeze(1)
batch_iou = (preds_dict[0]['iou'] +
......
# modified from https://github.com/Haiyang-W/DSVT
import numpy as np
import torch
import torch.nn as nn
import torch_scatter
......@@ -76,6 +77,7 @@ class DynamicPillarVFE3D(nn.Module):
self.voxel_x = voxel_size[0]
self.voxel_y = voxel_size[1]
self.voxel_z = voxel_size[2]
point_cloud_range = np.array(point_cloud_range).astype(np.float32)
self.x_offset = self.voxel_x / 2 + point_cloud_range[0]
self.y_offset = self.voxel_y / 2 + point_cloud_range[1]
self.z_offset = self.voxel_z / 2 + point_cloud_range[2]
......
# modified from https://github.com/Haiyang-W/DSVT
import warnings
from typing import Optional, Sequence, Tuple
from typing import Sequence, Tuple
from mmengine.model import BaseModule
from torch import Tensor
......@@ -78,8 +76,8 @@ class ResSECOND(BaseModule):
out_channels (list[int]): Output channels for multi-scale feature maps.
blocks_nums (list[int]): Number of blocks in each stage.
layer_strides (list[int]): Strides of each stage.
norm_cfg (dict): Config dict of normalization layers.
conv_cfg (dict): Config dict of convolutional layers.
init_cfg (dict, optional): Config for weight initialization.
Defaults to None.
"""
def __init__(self,
......@@ -87,8 +85,7 @@ class ResSECOND(BaseModule):
out_channels: Sequence[int] = [128, 128, 256],
blocks_nums: Sequence[int] = [1, 2, 2],
layer_strides: Sequence[int] = [2, 2, 2],
init_cfg: OptMultiConfig = None,
pretrained: Optional[str] = None) -> None:
init_cfg: OptMultiConfig = None) -> None:
super(ResSECOND, self).__init__(init_cfg=init_cfg)
assert len(layer_strides) == len(blocks_nums)
assert len(out_channels) == len(blocks_nums)
......@@ -108,14 +105,6 @@ class ResSECOND(BaseModule):
BasicResBlock(out_channels[i], out_channels[i]))
blocks.append(nn.Sequential(*cur_layers))
self.blocks = nn.Sequential(*blocks)
assert not (init_cfg and pretrained), \
'init_cfg and pretrained cannot be setting at the same time'
if isinstance(pretrained, str):
warnings.warn('DeprecationWarning: pretrained is a deprecated, '
'please use "init_cfg" instead')
self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
else:
self.init_cfg = dict(type='Kaiming', layer='Conv2d')
def forward(self, x: Tensor) -> Tuple[Tensor, ...]:
"""Forward function.
......
from typing import List
import numpy as np
from mmcv import BaseTransform
from mmdet3d.registry import TRANSFORMS
@TRANSFORMS.register_module()
class ObjectRangeFilter3D(BaseTransform):
"""Filter objects by the range. It differs from `ObjectRangeFilter` by
using `in_range_3d` instead of `in_range_bev`.
Required Keys:
- gt_bboxes_3d
Modified Keys:
- gt_bboxes_3d
Args:
point_cloud_range (list[float]): Point cloud range.
"""
def __init__(self, point_cloud_range: List[float]) -> None:
self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
def transform(self, input_dict: dict) -> dict:
"""Transform function to filter objects by the range.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d'
keys are updated in the result dict.
"""
gt_bboxes_3d = input_dict['gt_bboxes_3d']
gt_labels_3d = input_dict['gt_labels_3d']
mask = gt_bboxes_3d.in_range_3d(self.pcd_range)
gt_bboxes_3d = gt_bboxes_3d[mask]
# mask is a torch tensor but gt_labels_3d is still numpy array
# using mask to index gt_labels_3d will cause bug when
# len(gt_labels_3d) == 1, where mask=1 will be interpreted
# as gt_labels_3d[1] and cause out of index error
gt_labels_3d = gt_labels_3d[mask.numpy().astype(bool)]
# limit rad to [-pi, pi]
gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi)
input_dict['gt_bboxes_3d'] = gt_bboxes_3d
input_dict['gt_labels_3d'] = gt_labels_3d
return input_dict
def __repr__(self) -> str:
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
return repr_str
@TRANSFORMS.register_module()
class PointsRangeFilter3D(BaseTransform):
"""Filter points by the range. It differs from `PointRangeFilter` by using
`in_range_bev` instead of `in_range_3d`.
Required Keys:
- points
- pts_instance_mask (optional)
Modified Keys:
- points
- pts_instance_mask (optional)
Args:
point_cloud_range (list[float]): Point cloud range.
"""
def __init__(self, point_cloud_range: List[float]) -> None:
self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
def transform(self, input_dict: dict) -> dict:
"""Transform function to filter points by the range.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after filtering, 'points', 'pts_instance_mask'
and 'pts_semantic_mask' keys are updated in the result dict.
"""
points = input_dict['points']
points_mask = points.in_range_bev(self.pcd_range[[0, 1, 3, 4]])
clean_points = points[points_mask]
input_dict['points'] = clean_points
points_mask = points_mask.numpy()
pts_instance_mask = input_dict.get('pts_instance_mask', None)
pts_semantic_mask = input_dict.get('pts_semantic_mask', None)
if pts_instance_mask is not None:
input_dict['pts_instance_mask'] = pts_instance_mask[points_mask]
if pts_semantic_mask is not None:
input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask]
return input_dict
def __repr__(self) -> str:
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
return repr_str
......@@ -3,10 +3,11 @@ from typing import Dict, List, Optional
import numpy as np
import torch
import torch.nn as nn
from mmdet.models.losses.utils import weighted_loss
from torch import Tensor
from mmdet3d.models.task_modules import CenterPointBBoxCoder
from mmdet3d.registry import TASK_UTILS
from mmdet3d.registry import MODELS, TASK_UTILS
from .ops.ingroup_inds.ingroup_inds_op import ingroup_inds
get_inner_win_inds_cuda = ingroup_inds
......@@ -266,7 +267,7 @@ class DSVTBBoxCoder(CenterPointBBoxCoder):
thresh_mask = final_scores > self.score_threshold
if self.post_center_range is not None:
self.post_center_range = torch.tensor(
self.post_center_range = torch.as_tensor(
self.post_center_range, device=heat.device)
mask = (final_box_preds[..., :3] >=
self.post_center_range[:3]).all(2)
......@@ -298,3 +299,142 @@ class DSVTBBoxCoder(CenterPointBBoxCoder):
'support post_center_range is not None for now!')
return predictions_dicts
def center_to_corner2d(center, dim):
corners_norm = torch.tensor(
[[-0.5, -0.5], [-0.5, 0.5], [0.5, 0.5], [0.5, -0.5]],
device=dim.device).type_as(center) # (4, 2)
corners = dim.view([-1, 1, 2]) * corners_norm.view([1, 4, 2]) # (N, 4, 2)
corners = corners + center.view(-1, 1, 2)
return corners
@weighted_loss
def diou3d_loss(pred_boxes, gt_boxes, eps: float = 1e-7):
"""
modified from https://github.com/agent-sgs/PillarNet/blob/master/det3d/core/utils/center_utils.py # noqa
Args:
pred_boxes (Tensor): Predicted boxes with shape (N, 7).
gt_boxes (Tensor): Ground-truth boxes with shape (N, 7).
Returns:
Tensor: Distance-IoU Loss.
"""
assert pred_boxes.shape[0] == gt_boxes.shape[0]
qcorners = center_to_corner2d(pred_boxes[:, :2],
pred_boxes[:, 3:5]) # (N, 4, 2)
gcorners = center_to_corner2d(gt_boxes[:, :2], gt_boxes[:,
3:5]) # (N, 4, 2)
inter_max_xy = torch.minimum(qcorners[:, 2], gcorners[:, 2])
inter_min_xy = torch.maximum(qcorners[:, 0], gcorners[:, 0])
out_max_xy = torch.maximum(qcorners[:, 2], gcorners[:, 2])
out_min_xy = torch.minimum(qcorners[:, 0], gcorners[:, 0])
# calculate area
volume_pred_boxes = pred_boxes[:, 3] * pred_boxes[:, 4] * pred_boxes[:, 5]
volume_gt_boxes = gt_boxes[:, 3] * gt_boxes[:, 4] * gt_boxes[:, 5]
inter_h = torch.minimum(
pred_boxes[:, 2] + 0.5 * pred_boxes[:, 5],
gt_boxes[:, 2] + 0.5 * gt_boxes[:, 5]) - torch.maximum(
pred_boxes[:, 2] - 0.5 * pred_boxes[:, 5],
gt_boxes[:, 2] - 0.5 * gt_boxes[:, 5])
inter_h = torch.clamp(inter_h, min=0)
inter = torch.clamp((inter_max_xy - inter_min_xy), min=0)
volume_inter = inter[:, 0] * inter[:, 1] * inter_h
volume_union = volume_gt_boxes + volume_pred_boxes - volume_inter + eps
# boxes_iou3d_gpu(pred_boxes, gt_boxes)
inter_diag = torch.pow(gt_boxes[:, 0:3] - pred_boxes[:, 0:3], 2).sum(-1)
outer_h = torch.maximum(
gt_boxes[:, 2] + 0.5 * gt_boxes[:, 5],
pred_boxes[:, 2] + 0.5 * pred_boxes[:, 5]) - torch.minimum(
gt_boxes[:, 2] - 0.5 * gt_boxes[:, 5],
pred_boxes[:, 2] - 0.5 * pred_boxes[:, 5])
outer_h = torch.clamp(outer_h, min=0)
outer = torch.clamp((out_max_xy - out_min_xy), min=0)
outer_diag = outer[:, 0]**2 + outer[:, 1]**2 + outer_h**2 + eps
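# DIoU = IoU - (squared distance between box centers) / (squared diagonal
# of the minimum enclosing box)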
dious = volume_inter / volume_union - inter_diag / outer_diag
dious = torch.clamp(dious, min=-1.0, max=1.0)
loss = 1 - dious
return loss
@MODELS.register_module()
class DIoU3DLoss(nn.Module):
r"""3D bboxes Implementation of `Distance-IoU Loss: Faster and Better
Learning for Bounding Box Regression <https://arxiv.org/abs/1911.08287>`_.
Code is modified from https://github.com/Zzh-tju/DIoU.
Args:
eps (float): Epsilon to avoid log(0). Defaults to 1e-6.
reduction (str): Options are "none", "mean" and "sum".
Defaults to "mean".
loss_weight (float): Weight of loss. Defaults to 1.0.
"""
def __init__(self,
eps: float = 1e-6,
reduction: str = 'mean',
loss_weight: float = 1.0) -> None:
super().__init__()
self.eps = eps
self.reduction = reduction
self.loss_weight = loss_weight
def forward(self,
pred: Tensor,
target: Tensor,
weight: Optional[Tensor] = None,
avg_factor: Optional[int] = None,
reduction_override: Optional[str] = None,
**kwargs) -> Tensor:
"""Forward function.
Args:
pred (Tensor): Predicted 3D bboxes with shape (n, 7).
target (Tensor): The learning target of the prediction,
shape (n, 7).
weight (Optional[Tensor], optional): The weight of loss for each
prediction. Defaults to None.
avg_factor (Optional[int], optional): Average factor that is used
to average the loss. Defaults to None.
reduction_override (Optional[str], optional): The reduction method
used to override the original reduction method of the loss.
Defaults to None. Options are "none", "mean" and "sum".
Returns:
Tensor: Loss tensor.
"""
if weight is not None and not torch.any(weight > 0):
if pred.dim() == weight.dim() + 1:
weight = weight.unsqueeze(1)
return (pred * weight).sum() # 0
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
if weight is not None and weight.dim() > 1:
# TODO: remove this in the future
# reduce the weight of shape (n, 7) to (n,) to match the
# diou3d_loss of shape (n,)
assert weight.shape == pred.shape
weight = weight.mean(-1)
loss = self.loss_weight * diou3d_loss(
pred,
target,
weight,
eps=self.eps,
reduction=reduction,
avg_factor=avg_factor,
**kwargs)
return loss
# NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection
> [NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection](https://arxiv.org/abs/2307.14620)
<!-- [ALGORITHM] -->
## Abstract
NeRF-Det is a novel method for indoor 3D detection with posed RGB images as input. Unlike existing indoor 3D detection methods that struggle to model scene geometry, NeRF-Det makes novel use of NeRF in an end-to-end manner to explicitly estimate 3D geometry, thereby improving 3D detection performance. Specifically, to avoid the significant extra latency associated with per-scene optimization of NeRF, NeRF-Det introduces sufficient geometry priors to enhance the generalizability of NeRF-MLP. Furthermore, it subtly connects the detection and NeRF branches through a shared MLP, enabling an efficient adaptation of NeRF to detection and yielding geometry-aware volumetric representations for 3D detection. NeRF-Det outperforms state-of-the-art methods by 3.9 mAP and 3.1 mAP on the ScanNet and ARKITScenes benchmarks, respectively. The authors provide extensive analysis to shed light on how NeRF-Det works. As a result of the joint-training design, NeRF-Det is able to generalize well to unseen scenes for object detection, view synthesis, and depth estimation tasks without requiring per-scene optimization. Code will be available at https://github.com/facebookresearch/NeRF-Det
<div align=center>
<img src="https://chenfengxu714.github.io/nerfdet/static/images/method-cropped_1.png" width="800"/>
</div>
## Introduction
This directory contains the implementation of NeRF-Det (https://arxiv.org/abs/2307.14620). Our implementation is built on top of MMDetection3D. We have updated NeRF-Det to be compatible with the latest mmdet3d version; the codebase and config files have all been adapted accordingly. All previously released pretrained models have been verified against the results listed below. However, newly trained models are yet to be uploaded.
<!-- Share any information you would like others to know. For example:
Author: @xxx.
This is an implementation of \[XXX\]. -->
## Dataset
The ScanNet dataset format in the latest version of mmdet3d only supports LiDAR-based tasks. For NeRF-Det, we need to create a new format of the ScanNet dataset.
Please follow the ScanNet data preparation instructions in mmdet3d to prepare the raw data. After that, use the following command to generate the pkls used in NeRF-Det.
```bash
python projects/NeRF-Det/prepare_infos.py --root-path ./data/scannet --out-dir ./data/scannet
```
The new format of the pkl is organized as below:
- scannet_infos_train.pkl: The train data infos. The detailed info of each scan is as follows:
- info\['instances'\]: A list of dicts containing all annotations; each dict holds the annotation information of a single instance. For the i-th instance:
- info\['instances'\]\[i\]\['bbox_3d'\]: List of 6 numbers representing the axis-aligned box in the depth coordinate system, in (x, y, z, l, w, h) order.
- info\['instances'\]\[i\]\['bbox_label_3d'\]: The label of each 3D bounding box.
- info\['cam2img'\]: The intrinsic matrix. Every scene has one matrix.
- info\['lidar2cam'\]: The extrinsic matrices. Every scene has 300 matrices.
- info\['img_paths'\]: The paths of the 300 RGB images.
- info\['axis_align_matrix'\]: The axis-alignment matrix. Every scene has one matrix.
After preparing your ScanNet dataset pkls, please change the paths in the configs to fit your project.
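If you want to sanity-check the generated files, here is a minimal sketch (for illustration only; it assumes the standard mmdet3d v1.x info layout with `metainfo`/`data_list` keys and the `*_new.pkl` file name used by the provided configs) that loads one scan and prints a few of the fields described above:
```python
import pickle

# Hypothetical quick check of the generated infos; adjust the path and keys
# if your pkl files are organized differently.
with open('data/scannet/scannet_infos_train_new.pkl', 'rb') as f:
    infos = pickle.load(f)

# mmdet3d v1.x info files are usually a dict with 'metainfo' and 'data_list'.
data_list = infos['data_list'] if isinstance(infos, dict) else infos
scan = data_list[0]

print(sorted(scan.keys()))              # e.g. axis_align_matrix, cam2img, img_paths, instances, ...
print(len(scan['img_paths']))           # 300 RGB images per scene
print(scan['instances'][0]['bbox_3d'])  # (x, y, z, l, w, h) in the depth coordinate system
```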
## Train
In MMDet3D's root directory, run the following command to train the model:
```bash
python tools/train.py projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py --work-dir ${WORK_DIR}
```
## Results and Models
### NeRF-Det
| Backbone | mAP@25 | mAP@50 | Log |
| :-------------------------------------------------------------: | :----: | :----: | :-------: |
| [NeRF-Det-R50](./configs/nerfdet_res50_2x_low_res.py) | 53.0 | 26.8 | [log](<>) |
| [NeRF-Det-R50\*](./configs/nerfdet_res50_2x_low_res_depth.py) | 52.2 | 28.5 | [log](<>) |
| [NeRF-Det-R101\*](./configs/nerfdet_res101_2x_low_res_depth.py) | 52.3 | 28.5 | [log](<>) |
(Here NeRF-Det-R50\* means this model uses depth information in the training step)
### Notes
- The values shown in the table all represent the best mAP during training.
- Since there is a lot of randomness in the behavior of the model, we conducted three experiments for each config and took the average. The mAP values shown in the above table are these averages.
- We also conducted the same experiments with the original code; the results are shown below.
| Backbone | mAP@25 | mAP@50 |
| :-------------: | :----: | :----: |
| NeRF-Det-R50 | 52.8 | 26.8 |
| NeRF-Det-R50\* | 52.4 | 27.5 |
| NeRF-Det-R101\* | 52.8 | 28.6 |
- Attention: Because of the randomness in the construction of the ScanNet dataset itself and in the behavior of the model, the training results can fluctuate considerably. In our experience, results fluctuate by plus or minus 1.5 points.
## Evaluation using pretrained models
1. Download the pretrained checkpoints through the links in the above table.
2. Testing
To test, use:
```bash
python tools/test.py projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py ${CHECKPOINT_PATH}
```
## Citation
<!-- You may remove this section if not applicable. -->
```latex
@inproceedings{
xu2023nerfdet,
title={NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection},
author={Xu, Chenfeng and Wu, Bichen and Hou, Ji and Tsai, Sam and Li, Ruilong and Wang, Jialiang and Zhan, Wei and He, Zijian and Vajda, Peter and Keutzer, Kurt and Tomizuka, Masayoshi},
booktitle={ICCV},
year={2023},
}
@inproceedings{
park2023time,
title={Time Will Tell: New Outlooks and A Baseline for Temporal Multi-View 3D Object Detection},
author={Jinhyung Park and Chenfeng Xu and Shijia Yang and Kurt Keutzer and Kris M. Kitani and Masayoshi Tomizuka and Wei Zhan},
booktitle={The Eleventh International Conference on Learning Representations },
year={2023},
url={https://openreview.net/forum?id=H3HcEJA2Um}
}
```
_base_ = ['../../../configs/_base_/default_runtime.py']
custom_imports = dict(imports=['projects.NeRF-Det.nerfdet'])
prior_generator = dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-3.2, -3.2, -1.28, 3.2, 3.2, 1.28]],
rotations=[.0])
model = dict(
type='NerfDet',
data_preprocessor=dict(
type='NeRFDetDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=10),
backbone=dict(
type='mmdet.ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
style='pytorch'),
neck=dict(
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=4),
neck_3d=dict(
type='IndoorImVoxelNeck',
in_channels=256,
out_channels=128,
n_blocks=[1, 1, 1]),
bbox_head=dict(
type='NerfDetHead',
bbox_loss=dict(type='AxisAlignedIoULoss', loss_weight=1.0),
n_classes=18,
n_levels=3,
n_channels=128,
n_reg_outs=6,
pts_assign_threshold=27,
pts_center_threshold=18,
prior_generator=prior_generator),
prior_generator=prior_generator,
voxel_size=[.16, .16, .2],
n_voxels=[40, 40, 16],
aabb=([-2.7, -2.7, -0.78], [3.7, 3.7, 1.78]),
near_far_range=[0.2, 8.0],
N_samples=64,
N_rand=2048,
nerf_mode='image',
depth_supervise=True,
use_nerf_mask=True,
nerf_sample_view=20,
squeeze_scale=4,
nerf_density=True,
train_cfg=dict(),
test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01))
dataset_type = 'MultiViewScanNetDataset'
data_root = 'data/scannet/'
class_names = [
'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf',
'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain',
'toilet', 'sink', 'bathtub', 'garbagebin'
]
metainfo = dict(CLASSES=class_names)
file_client_args = dict(backend='disk')
input_modality = dict(
use_camera=True,
use_depth=True,
use_lidar=False,
use_neuralrecon_depth=False,
use_ray=True)
backend_args = None
train_collect_keys = [
'img', 'gt_bboxes_3d', 'gt_labels_3d', 'depth', 'lightpos', 'nerf_sizes',
'raydirs', 'gt_images', 'gt_depths', 'denorm_images'
]
test_collect_keys = [
'img',
'depth',
'lightpos',
'nerf_sizes',
'raydirs',
'gt_images',
'gt_depths',
'denorm_images',
]
train_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=48,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=10),
dict(type='RandomShiftOrigin', std=(.7, .7, .0)),
dict(type='PackNeRFDetInputs', keys=train_collect_keys)
]
test_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=101,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=1),
dict(type='PackNeRFDetInputs', keys=test_collect_keys)
]
train_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=6,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_train_new.pkl',
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo)))
val_dataloader = dict(
batch_size=1,
num_workers=5,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_val_new.pkl',
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo))
test_dataloader = val_dataloader
val_evaluator = dict(type='IndoorMetric')
test_evaluator = val_evaluator
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
test_cfg = dict()
val_cfg = dict()
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
# hooks
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=12))
# runtime
find_unused_parameters = True # only 1 of 4 FPN outputs is used
_base_ = ['./nerfdet_res50_2x_low_res_depth.py']
model = dict(depth_supervise=False)
dataset_type = 'MultiViewScanNetDataset'
data_root = 'data/scannet/'
class_names = [
'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf',
'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain',
'toilet', 'sink', 'bathtub', 'garbagebin'
]
metainfo = dict(CLASSES=class_names)
file_client_args = dict(backend='disk')
input_modality = dict(use_depth=False)
backend_args = None
train_collect_keys = [
'img', 'gt_bboxes_3d', 'gt_labels_3d', 'lightpos', 'nerf_sizes', 'raydirs',
'gt_images', 'gt_depths', 'denorm_images'
]
test_collect_keys = [
'img',
'lightpos',
'nerf_sizes',
'raydirs',
'gt_images',
'gt_depths',
'denorm_images',
]
train_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=50,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=10),
dict(type='RandomShiftOrigin', std=(.7, .7, .0)),
dict(type='PackNeRFDetInputs', keys=train_collect_keys)
]
test_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=101,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=1),
dict(type='PackNeRFDetInputs', keys=test_collect_keys)
]
train_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=6,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_train_new.pkl',
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo)))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_val_new.pkl',
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo))
test_dataloader = val_dataloader
_base_ = ['../../../configs/_base_/default_runtime.py']
custom_imports = dict(imports=['projects.NeRF-Det.nerfdet'])
prior_generator = dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-3.2, -3.2, -1.28, 3.2, 3.2, 1.28]],
rotations=[.0])
model = dict(
type='NerfDet',
data_preprocessor=dict(
type='NeRFDetDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=10),
backbone=dict(
type='mmdet.ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
style='pytorch'),
neck=dict(
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=4),
neck_3d=dict(
type='IndoorImVoxelNeck',
in_channels=256,
out_channels=128,
n_blocks=[1, 1, 1]),
bbox_head=dict(
type='NerfDetHead',
bbox_loss=dict(type='AxisAlignedIoULoss', loss_weight=1.0),
n_classes=18,
n_levels=3,
n_channels=128,
n_reg_outs=6,
pts_assign_threshold=27,
pts_center_threshold=18,
prior_generator=prior_generator),
prior_generator=prior_generator,
voxel_size=[.16, .16, .2],
n_voxels=[40, 40, 16],
aabb=([-2.7, -2.7, -0.78], [3.7, 3.7, 1.78]),
near_far_range=[0.2, 8.0],
N_samples=64,
N_rand=2048,
nerf_mode='image',
depth_supervise=True,
use_nerf_mask=True,
nerf_sample_view=20,
squeeze_scale=4,
nerf_density=True,
train_cfg=dict(),
test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01))
dataset_type = 'MultiViewScanNetDataset'
data_root = 'data/scannet/'
class_names = [
'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf',
'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain',
'toilet', 'sink', 'bathtub', 'garbagebin'
]
metainfo = dict(CLASSES=class_names)
file_client_args = dict(backend='disk')
input_modality = dict(
use_camera=True,
use_depth=True,
use_lidar=False,
use_neuralrecon_depth=False,
use_ray=True)
backend_args = None
train_collect_keys = [
'img', 'gt_bboxes_3d', 'gt_labels_3d', 'depth', 'lightpos', 'nerf_sizes',
'raydirs', 'gt_images', 'gt_depths', 'denorm_images'
]
test_collect_keys = [
'img',
'depth',
'lightpos',
'nerf_sizes',
'raydirs',
'gt_images',
'gt_depths',
'denorm_images',
]
train_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=50,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=10),
dict(type='RandomShiftOrigin', std=(.7, .7, .0)),
dict(type='PackNeRFDetInputs', keys=train_collect_keys)
]
test_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=101,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=1),
dict(type='PackNeRFDetInputs', keys=test_collect_keys)
]
train_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=6,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_train_new.pkl',
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo)))
val_dataloader = dict(
batch_size=1,
num_workers=5,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_val_new.pkl',
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo))
test_dataloader = val_dataloader
val_evaluator = dict(type='IndoorMetric')
test_evaluator = val_evaluator
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
test_cfg = dict()
val_cfg = dict()
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
# hooks
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=12))
# runtime
find_unused_parameters = True # only 1 of 4 FPN outputs is used
from .data_preprocessor import NeRFDetDataPreprocessor
from .formating import PackNeRFDetInputs
from .multiview_pipeline import MultiViewPipeline, RandomShiftOrigin
from .nerfdet import NerfDet
from .nerfdet_head import NerfDetHead
from .scannet_multiview_dataset import MultiViewScanNetDataset
__all__ = [
'MultiViewScanNetDataset', 'MultiViewPipeline', 'RandomShiftOrigin',
'PackNeRFDetInputs', 'NeRFDetDataPreprocessor', 'NerfDetHead', 'NerfDet'
]
# Copyright (c) OpenMMLab. All rights reserved.
import math
from numbers import Number
from typing import Dict, List, Optional, Sequence, Tuple, Union
import numpy as np
import torch
from mmdet.models import DetDataPreprocessor
from mmdet.models.utils.misc import samplelist_boxtype2tensor
from mmengine.model import stack_batch
from mmengine.utils import is_seq_of
from torch import Tensor
from torch.nn import functional as F
from mmdet3d.models.data_preprocessors.utils import multiview_img_stack_batch
from mmdet3d.models.data_preprocessors.voxelize import (
VoxelizationByGridShape, dynamic_scatter_3d)
from mmdet3d.registry import MODELS
from mmdet3d.structures.det3d_data_sample import SampleList
from mmdet3d.utils import OptConfigType
@MODELS.register_module()
class NeRFDetDataPreprocessor(DetDataPreprocessor):
"""In NeRF-Det, some extra information is needed in NeRF branch. We put the
datapreprocessor operations of these new information such as stack and pack
operations in this class. You can find the stack operations in subfuction
'collate_data' and the pack operations in 'simple_process'. Other codes are
the same as the default class 'DetDataPreprocessor'.
Points / Image pre-processor for point clouds / vision-only / multi-
modality 3D detection tasks.
    It provides the following data pre-processing:
- Collate and move image and point cloud data to the target device.
- 1) For image data:
- Pad images in inputs to the maximum size of current batch with defined
``pad_value``. The padding size can be divisible by a defined
``pad_size_divisor``.
- Stack images in inputs to batch_imgs.
- Convert images in inputs from bgr to rgb if the shape of input is
(3, H, W).
- Normalize images in inputs with defined std and mean.
- Do batch augmentations during training.
- 2) For point cloud data:
- If no voxelization, directly return list of point cloud data.
- If voxelization is applied, voxelize point cloud according to
``voxel_type`` and obtain ``voxels``.
Args:
voxel (bool): Whether to apply voxelization to point cloud.
Defaults to False.
voxel_type (str): Voxelization type. Two voxelization types are
provided: 'hard' and 'dynamic', respectively for hard voxelization
and dynamic voxelization. Defaults to 'hard'.
voxel_layer (dict or :obj:`ConfigDict`, optional): Voxelization layer
config. Defaults to None.
batch_first (bool): Whether to put the batch dimension to the first
dimension when getting voxel coordinates. Defaults to True.
max_voxels (int, optional): Maximum number of voxels in each voxel
grid. Defaults to None.
mean (Sequence[Number], optional): The pixel mean of R, G, B channels.
Defaults to None.
std (Sequence[Number], optional): The pixel standard deviation of
R, G, B channels. Defaults to None.
pad_size_divisor (int): The size of padded image should be divisible by
``pad_size_divisor``. Defaults to 1.
pad_value (float or int): The padded pixel value. Defaults to 0.
pad_mask (bool): Whether to pad instance masks. Defaults to False.
mask_pad_value (int): The padded pixel value for instance masks.
Defaults to 0.
pad_seg (bool): Whether to pad semantic segmentation maps.
Defaults to False.
seg_pad_value (int): The padded pixel value for semantic segmentation
maps. Defaults to 255.
bgr_to_rgb (bool): Whether to convert image from BGR to RGB.
Defaults to False.
rgb_to_bgr (bool): Whether to convert image from RGB to BGR.
Defaults to False.
boxtype2tensor (bool): Whether to convert the ``BaseBoxes`` type of
bboxes data to ``Tensor`` type. Defaults to True.
        non_blocking (bool): Whether to use non-blocking (asynchronous)
            transfer when moving data to the device. Defaults to False.
batch_augments (List[dict], optional): Batch-level augmentations.
Defaults to None.
"""
def __init__(self,
voxel: bool = False,
voxel_type: str = 'hard',
voxel_layer: OptConfigType = None,
batch_first: bool = True,
max_voxels: Optional[int] = None,
mean: Sequence[Number] = None,
std: Sequence[Number] = None,
pad_size_divisor: int = 1,
pad_value: Union[float, int] = 0,
pad_mask: bool = False,
mask_pad_value: int = 0,
pad_seg: bool = False,
seg_pad_value: int = 255,
bgr_to_rgb: bool = False,
rgb_to_bgr: bool = False,
boxtype2tensor: bool = True,
non_blocking: bool = False,
batch_augments: Optional[List[dict]] = None) -> None:
super(NeRFDetDataPreprocessor, self).__init__(
mean=mean,
std=std,
pad_size_divisor=pad_size_divisor,
pad_value=pad_value,
pad_mask=pad_mask,
mask_pad_value=mask_pad_value,
pad_seg=pad_seg,
seg_pad_value=seg_pad_value,
bgr_to_rgb=bgr_to_rgb,
rgb_to_bgr=rgb_to_bgr,
boxtype2tensor=boxtype2tensor,
non_blocking=non_blocking,
batch_augments=batch_augments)
self.voxel = voxel
self.voxel_type = voxel_type
self.batch_first = batch_first
self.max_voxels = max_voxels
if voxel:
self.voxel_layer = VoxelizationByGridShape(**voxel_layer)
def forward(self,
data: Union[dict, List[dict]],
training: bool = False) -> Union[dict, List[dict]]:
"""Perform normalization, padding and bgr2rgb conversion based on
``BaseDataPreprocessor``.
Args:
            data (dict or List[dict]): Data from the dataloader. The dict
                contains the whole batch data; when it is a list[dict], the
                list indicates test-time augmentation.
training (bool): Whether to enable training time augmentation.
Defaults to False.
Returns:
dict or List[dict]: Data in the same format as the model input.
"""
if isinstance(data, list):
num_augs = len(data)
aug_batch_data = []
for aug_id in range(num_augs):
single_aug_batch_data = self.simple_process(
data[aug_id], training)
aug_batch_data.append(single_aug_batch_data)
return aug_batch_data
else:
return self.simple_process(data, training)
def simple_process(self, data: dict, training: bool = False) -> dict:
"""Perform normalization, padding and bgr2rgb conversion for img data
based on ``BaseDataPreprocessor``, and voxelize point cloud if `voxel`
        is set to True.
Args:
data (dict): Data sampled from dataloader.
training (bool): Whether to enable training time augmentation.
Defaults to False.
Returns:
dict: Data in the same format as the model input.
"""
if 'img' in data['inputs']:
batch_pad_shape = self._get_pad_shape(data)
data = self.collate_data(data)
inputs, data_samples = data['inputs'], data['data_samples']
batch_inputs = dict()
if 'points' in inputs:
batch_inputs['points'] = inputs['points']
if self.voxel:
voxel_dict = self.voxelize(inputs['points'], data_samples)
batch_inputs['voxels'] = voxel_dict
if 'imgs' in inputs:
imgs = inputs['imgs']
if data_samples is not None:
# NOTE the batched image size information may be useful, e.g.
# in DETR, this is needed for the construction of masks, which
# is then used for the transformer_head.
batch_input_shape = tuple(imgs[0].size()[-2:])
for data_sample, pad_shape in zip(data_samples,
batch_pad_shape):
data_sample.set_metainfo({
'batch_input_shape': batch_input_shape,
'pad_shape': pad_shape
})
if self.boxtype2tensor:
samplelist_boxtype2tensor(data_samples)
if self.pad_mask:
self.pad_gt_masks(data_samples)
if self.pad_seg:
self.pad_gt_sem_seg(data_samples)
if training and self.batch_augments is not None:
for batch_aug in self.batch_augments:
imgs, data_samples = batch_aug(imgs, data_samples)
batch_inputs['imgs'] = imgs
        # Hard-coded here; to be refactored later.
# if len(inputs['depth']) != 0:
if 'depth' in inputs.keys():
batch_inputs['depth'] = inputs['depth']
batch_inputs['lightpos'] = inputs['lightpos']
batch_inputs['nerf_sizes'] = inputs['nerf_sizes']
batch_inputs['denorm_images'] = inputs['denorm_images']
batch_inputs['raydirs'] = inputs['raydirs']
return {'inputs': batch_inputs, 'data_samples': data_samples}
def preprocess_img(self, _batch_img: Tensor) -> Tensor:
# channel transform
if self._channel_conversion:
_batch_img = _batch_img[[2, 1, 0], ...]
# Convert to float after channel conversion to ensure
# efficiency
_batch_img = _batch_img.float()
# Normalization.
if self._enable_normalize:
if self.mean.shape[0] == 3:
assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3, (
'If the mean has 3 values, the input tensor '
'should in shape of (3, H, W), but got the '
f'tensor with shape {_batch_img.shape}')
_batch_img = (_batch_img - self.mean) / self.std
return _batch_img
def collate_data(self, data: dict) -> dict:
"""Copy data to the target device and perform normalization, padding
and bgr2rgb conversion and stack based on ``BaseDataPreprocessor``.
Collates the data sampled from dataloader into a list of dict and list
of labels, and then copies tensor to the target device.
Args:
data (dict): Data sampled from dataloader.
Returns:
dict: Data in the same format as the model input.
"""
data = self.cast_data(data) # type: ignore
if 'img' in data['inputs']:
_batch_imgs = data['inputs']['img']
# Process data with `pseudo_collate`.
if is_seq_of(_batch_imgs, torch.Tensor):
batch_imgs = []
img_dim = _batch_imgs[0].dim()
for _batch_img in _batch_imgs:
if img_dim == 3: # standard img
_batch_img = self.preprocess_img(_batch_img)
elif img_dim == 4:
_batch_img = [
self.preprocess_img(_img) for _img in _batch_img
]
_batch_img = torch.stack(_batch_img, dim=0)
batch_imgs.append(_batch_img)
# Pad and stack Tensor.
if img_dim == 3:
batch_imgs = stack_batch(batch_imgs, self.pad_size_divisor,
self.pad_value)
elif img_dim == 4:
batch_imgs = multiview_img_stack_batch(
batch_imgs, self.pad_size_divisor, self.pad_value)
# Process data with `default_collate`.
elif isinstance(_batch_imgs, torch.Tensor):
assert _batch_imgs.dim() == 4, (
'The input of `ImgDataPreprocessor` should be a NCHW '
'tensor or a list of tensor, but got a tensor with '
f'shape: {_batch_imgs.shape}')
if self._channel_conversion:
_batch_imgs = _batch_imgs[:, [2, 1, 0], ...]
# Convert to float after channel conversion to ensure
# efficiency
_batch_imgs = _batch_imgs.float()
if self._enable_normalize:
_batch_imgs = (_batch_imgs - self.mean) / self.std
h, w = _batch_imgs.shape[2:]
target_h = math.ceil(
h / self.pad_size_divisor) * self.pad_size_divisor
target_w = math.ceil(
w / self.pad_size_divisor) * self.pad_size_divisor
pad_h = target_h - h
pad_w = target_w - w
batch_imgs = F.pad(_batch_imgs, (0, pad_w, 0, pad_h),
'constant', self.pad_value)
else:
raise TypeError(
'Output of `cast_data` should be a list of dict '
'or a tuple with inputs and data_samples, but got '
f'{type(data)}: {data}')
data['inputs']['imgs'] = batch_imgs
if 'raydirs' in data['inputs']:
_batch_dirs = data['inputs']['raydirs']
batch_dirs = stack_batch(_batch_dirs)
data['inputs']['raydirs'] = batch_dirs
if 'lightpos' in data['inputs']:
_batch_poses = data['inputs']['lightpos']
batch_poses = stack_batch(_batch_poses)
data['inputs']['lightpos'] = batch_poses
if 'denorm_images' in data['inputs']:
_batch_denorm_imgs = data['inputs']['denorm_images']
# Process data with `pseudo_collate`.
if is_seq_of(_batch_denorm_imgs, torch.Tensor):
denorm_img_dim = _batch_denorm_imgs[0].dim()
# Pad and stack Tensor.
if denorm_img_dim == 3:
batch_denorm_imgs = stack_batch(_batch_denorm_imgs,
self.pad_size_divisor,
self.pad_value)
elif denorm_img_dim == 4:
batch_denorm_imgs = multiview_img_stack_batch(
_batch_denorm_imgs, self.pad_size_divisor,
self.pad_value)
data['inputs']['denorm_images'] = batch_denorm_imgs
data.setdefault('data_samples', None)
return data
def _get_pad_shape(self, data: dict) -> List[Tuple[int, int]]:
"""Get the pad_shape of each image based on data and
pad_size_divisor."""
# rewrite `_get_pad_shape` for obtaining image inputs.
_batch_inputs = data['inputs']['img']
# Process data with `pseudo_collate`.
if is_seq_of(_batch_inputs, torch.Tensor):
batch_pad_shape = []
for ori_input in _batch_inputs:
if ori_input.dim() == 4:
                    # multi-view input: select one of the
                    # images to calculate the pad shape
ori_input = ori_input[0]
pad_h = int(
np.ceil(ori_input.shape[1] /
self.pad_size_divisor)) * self.pad_size_divisor
pad_w = int(
np.ceil(ori_input.shape[2] /
self.pad_size_divisor)) * self.pad_size_divisor
batch_pad_shape.append((pad_h, pad_w))
# Process data with `default_collate`.
elif isinstance(_batch_inputs, torch.Tensor):
assert _batch_inputs.dim() == 4, (
'The input of `ImgDataPreprocessor` should be a NCHW tensor '
'or a list of tensor, but got a tensor with shape: '
f'{_batch_inputs.shape}')
pad_h = int(
np.ceil(_batch_inputs.shape[1] /
self.pad_size_divisor)) * self.pad_size_divisor
pad_w = int(
np.ceil(_batch_inputs.shape[2] /
self.pad_size_divisor)) * self.pad_size_divisor
batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0]
else:
raise TypeError('Output of `cast_data` should be a list of dict '
'or a tuple with inputs and data_samples, but got '
f'{type(data)}: {data}')
return batch_pad_shape
@torch.no_grad()
def voxelize(self, points: List[Tensor],
data_samples: SampleList) -> Dict[str, Tensor]:
"""Apply voxelization to point cloud.
Args:
points (List[Tensor]): Point cloud in one data batch.
            data_samples (list[:obj:`NeRFDet3DDataSample`]): The annotation
                data of every sample. Voxel-wise annotations are added for
                segmentation.
Returns:
Dict[str, Tensor]: Voxelization information.
- voxels (Tensor): Features of voxels, shape is MxNxC for hard
voxelization, NxC for dynamic voxelization.
- coors (Tensor): Coordinates of voxels, shape is Nx(1+NDim),
where 1 represents the batch index.
- num_points (Tensor, optional): Number of points in each voxel.
- voxel_centers (Tensor, optional): Centers of voxels.
"""
voxel_dict = dict()
if self.voxel_type == 'hard':
voxels, coors, num_points, voxel_centers = [], [], [], []
for i, res in enumerate(points):
res_voxels, res_coors, res_num_points = self.voxel_layer(res)
res_voxel_centers = (
res_coors[:, [2, 1, 0]] + 0.5) * res_voxels.new_tensor(
self.voxel_layer.voxel_size) + res_voxels.new_tensor(
self.voxel_layer.point_cloud_range[0:3])
res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
voxels.append(res_voxels)
coors.append(res_coors)
num_points.append(res_num_points)
voxel_centers.append(res_voxel_centers)
voxels = torch.cat(voxels, dim=0)
coors = torch.cat(coors, dim=0)
num_points = torch.cat(num_points, dim=0)
voxel_centers = torch.cat(voxel_centers, dim=0)
voxel_dict['num_points'] = num_points
voxel_dict['voxel_centers'] = voxel_centers
elif self.voxel_type == 'dynamic':
coors = []
            # dynamic voxelization only provides a coors mapping
for i, res in enumerate(points):
res_coors = self.voxel_layer(res)
res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
coors.append(res_coors)
voxels = torch.cat(points, dim=0)
coors = torch.cat(coors, dim=0)
elif self.voxel_type == 'cylindrical':
voxels, coors = [], []
for i, (res, data_sample) in enumerate(zip(points, data_samples)):
rho = torch.sqrt(res[:, 0]**2 + res[:, 1]**2)
phi = torch.atan2(res[:, 1], res[:, 0])
polar_res = torch.stack((rho, phi, res[:, 2]), dim=-1)
min_bound = polar_res.new_tensor(
self.voxel_layer.point_cloud_range[:3])
max_bound = polar_res.new_tensor(
self.voxel_layer.point_cloud_range[3:])
                try:  # clamp with tensor bounds needs PyTorch >= 1.9.0
polar_res_clamp = torch.clamp(polar_res, min_bound,
max_bound)
except TypeError:
polar_res_clamp = polar_res.clone()
for coor_idx in range(3):
polar_res_clamp[:, coor_idx][
polar_res[:, coor_idx] >
max_bound[coor_idx]] = max_bound[coor_idx]
polar_res_clamp[:, coor_idx][
polar_res[:, coor_idx] <
min_bound[coor_idx]] = min_bound[coor_idx]
res_coors = torch.floor(
(polar_res_clamp - min_bound) / polar_res_clamp.new_tensor(
self.voxel_layer.voxel_size)).int()
self.get_voxel_seg(res_coors, data_sample)
res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
res_voxels = torch.cat((polar_res, res[:, :2], res[:, 3:]),
dim=-1)
voxels.append(res_voxels)
coors.append(res_coors)
voxels = torch.cat(voxels, dim=0)
coors = torch.cat(coors, dim=0)
elif self.voxel_type == 'minkunet':
voxels, coors = [], []
voxel_size = points[0].new_tensor(self.voxel_layer.voxel_size)
for i, (res, data_sample) in enumerate(zip(points, data_samples)):
res_coors = torch.round(res[:, :3] / voxel_size).int()
res_coors -= res_coors.min(0)[0]
res_coors_numpy = res_coors.cpu().numpy()
inds, point2voxel_map = self.sparse_quantize(
res_coors_numpy, return_index=True, return_inverse=True)
point2voxel_map = torch.from_numpy(point2voxel_map).cuda()
if self.training and self.max_voxels is not None:
if len(inds) > self.max_voxels:
inds = np.random.choice(
inds, self.max_voxels, replace=False)
inds = torch.from_numpy(inds).cuda()
if hasattr(data_sample.gt_pts_seg, 'pts_semantic_mask'):
data_sample.gt_pts_seg.voxel_semantic_mask \
= data_sample.gt_pts_seg.pts_semantic_mask[inds]
res_voxel_coors = res_coors[inds]
res_voxels = res[inds]
if self.batch_first:
res_voxel_coors = F.pad(
res_voxel_coors, (1, 0), mode='constant', value=i)
data_sample.batch_idx = res_voxel_coors[:, 0]
else:
res_voxel_coors = F.pad(
res_voxel_coors, (0, 1), mode='constant', value=i)
data_sample.batch_idx = res_voxel_coors[:, -1]
data_sample.point2voxel_map = point2voxel_map.long()
voxels.append(res_voxels)
coors.append(res_voxel_coors)
voxels = torch.cat(voxels, dim=0)
coors = torch.cat(coors, dim=0)
else:
raise ValueError(f'Invalid voxelization type {self.voxel_type}')
voxel_dict['voxels'] = voxels
voxel_dict['coors'] = coors
return voxel_dict
def get_voxel_seg(self, res_coors: Tensor,
data_sample: SampleList) -> None:
"""Get voxel-wise segmentation label and point2voxel map.
Args:
res_coors (Tensor): The voxel coordinates of points, Nx3.
            data_sample (:obj:`NeRFDet3DDataSample`): The annotation data of
                every sample. Voxel-wise annotations are added for
                segmentation.
"""
if self.training:
pts_semantic_mask = data_sample.gt_pts_seg.pts_semantic_mask
voxel_semantic_mask, _, point2voxel_map = dynamic_scatter_3d(
F.one_hot(pts_semantic_mask.long()).float(), res_coors, 'mean',
True)
voxel_semantic_mask = torch.argmax(voxel_semantic_mask, dim=-1)
data_sample.gt_pts_seg.voxel_semantic_mask = voxel_semantic_mask
data_sample.point2voxel_map = point2voxel_map
else:
pseudo_tensor = res_coors.new_ones([res_coors.shape[0], 1]).float()
_, _, point2voxel_map = dynamic_scatter_3d(pseudo_tensor,
res_coors, 'mean', True)
data_sample.point2voxel_map = point2voxel_map
def ravel_hash(self, x: np.ndarray) -> np.ndarray:
"""Get voxel coordinates hash for np.unique.
Args:
x (np.ndarray): The voxel coordinates of points, Nx3.
Returns:
            np.ndarray: Voxel coordinate hash.
"""
assert x.ndim == 2, x.shape
x = x - np.min(x, axis=0)
x = x.astype(np.uint64, copy=False)
xmax = np.max(x, axis=0).astype(np.uint64) + 1
h = np.zeros(x.shape[0], dtype=np.uint64)
for k in range(x.shape[1] - 1):
h += x[:, k]
h *= xmax[k + 1]
h += x[:, -1]
return h
def sparse_quantize(self,
coords: np.ndarray,
return_index: bool = False,
return_inverse: bool = False) -> List[np.ndarray]:
"""Sparse Quantization for voxel coordinates used in Minkunet.
Args:
coords (np.ndarray): The voxel coordinates of points, Nx3.
return_index (bool): Whether to return the indices of the unique
coords, shape (M,).
return_inverse (bool): Whether to return the indices of the
original coords, shape (N,).
Returns:
            List[np.ndarray]: The unique indices and/or the inverse map,
                depending on ``return_index`` and ``return_inverse``.
"""
_, indices, inverse_indices = np.unique(
self.ravel_hash(coords), return_index=True, return_inverse=True)
coords = coords[indices]
outputs = []
if return_index:
outputs += [indices]
if return_inverse:
outputs += [inverse_indices]
return outputs
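# --- Illustrative sketch only (added for exposition, not used by the class):
# a self-contained replay of the sparse quantization above, i.e. hash integer
# voxel coordinates with the same mixed-radix scheme as `ravel_hash`, then
# deduplicate with `np.unique` to get one representative point per voxel and
# a point-to-voxel inverse map. The toy coordinates and the helper name
# `_demo_sparse_quantize` are assumptions for illustration.
def _demo_sparse_quantize() -> None:
    coords = np.array([[0, 0, 0],
                       [0, 0, 0],
                       [1, 2, 3],
                       [1, 2, 3],
                       [4, 0, 1]])
    shifted = (coords - coords.min(0)).astype(np.uint64)
    xmax = shifted.max(0).astype(np.uint64) + 1
    h = np.zeros(shifted.shape[0], dtype=np.uint64)
    for k in range(shifted.shape[1] - 1):
        h += shifted[:, k]
        h *= xmax[k + 1]
    h += shifted[:, -1]
    _, keep, point2voxel = np.unique(
        h, return_index=True, return_inverse=True)
    # keep -> [0, 2, 4]: indices of one representative point per voxel.
    # point2voxel -> [0, 0, 1, 1, 2]: every point mapped to its voxel index.
    print(keep, point2voxel)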
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Sequence, Union
import mmengine
import numpy as np
import torch
from mmcv import BaseTransform
from mmengine.structures import InstanceData
from numpy import dtype
from mmdet3d.registry import TRANSFORMS
from mmdet3d.structures import BaseInstance3DBoxes, PointData
from mmdet3d.structures.points import BasePoints
# from .det3d_data_sample import Det3DDataSample
from .nerf_det3d_data_sample import NeRFDet3DDataSample
def to_tensor(
data: Union[torch.Tensor, np.ndarray, Sequence, int,
float]) -> torch.Tensor:
"""Convert objects of various python types to :obj:`torch.Tensor`.
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.
Args:
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
be converted.
Returns:
torch.Tensor: the converted data.
"""
if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
if data.dtype is dtype('float64'):
data = data.astype(np.float32)
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not mmengine.is_str(data):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError(f'type {type(data)} cannot be converted to tensor.')
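# --- Illustrative sketch only: the conversions performed by `to_tensor`
# above. The helper name `_demo_to_tensor` is an assumption added for
# exposition and is not part of the original module.
def _demo_to_tensor() -> None:
    # float64 arrays are downcast to float32 before conversion
    assert to_tensor(np.zeros((2, 3), dtype=np.float64)).dtype == torch.float32
    # tensors are returned unchanged
    assert to_tensor(torch.ones(4)).dtype == torch.float32
    # plain sequences go through `torch.tensor`
    assert to_tensor([1, 2, 3]).tolist() == [1, 2, 3]
    # Python ints / floats become 1-element Long / Float tensors
    assert to_tensor(5).dtype == torch.int64
    assert to_tensor(0.5).dtype == torch.float32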
@TRANSFORMS.register_module()
class PackNeRFDetInputs(BaseTransform):
INPUTS_KEYS = ['points', 'img']
NERF_INPUT_KEYS = [
'img', 'denorm_images', 'depth', 'lightpos', 'nerf_sizes', 'raydirs'
]
INSTANCEDATA_3D_KEYS = [
'gt_bboxes_3d', 'gt_labels_3d', 'attr_labels', 'depths', 'centers_2d'
]
INSTANCEDATA_2D_KEYS = [
'gt_bboxes',
'gt_bboxes_labels',
]
NERF_3D_KEYS = ['gt_images', 'gt_depths']
SEG_KEYS = [
'gt_seg_map', 'pts_instance_mask', 'pts_semantic_mask',
'gt_semantic_seg'
]
def __init__(
self,
keys: tuple,
meta_keys: tuple = ('img_path', 'ori_shape', 'img_shape', 'lidar2img',
'depth2img', 'cam2img', 'pad_shape',
'scale_factor', 'flip', 'pcd_horizontal_flip',
'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
'img_norm_cfg', 'num_pts_feats', 'pcd_trans',
'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
'pcd_rotation_angle', 'lidar_path',
'transformation_3d_flow', 'trans_mat',
'affine_aug', 'sweep_img_metas', 'ori_cam2img',
'cam2global', 'crop_offset', 'img_crop_offset',
'resize_img_shape', 'lidar2cam', 'ori_lidar2img',
'num_ref_frames', 'num_views', 'ego2global',
'axis_align_matrix')
) -> None:
self.keys = keys
self.meta_keys = meta_keys
def _remove_prefix(self, key: str) -> str:
if key.startswith('gt_'):
key = key[3:]
return key
def transform(self, results: Union[dict,
List[dict]]) -> Union[dict, List[dict]]:
"""Method to pack the input data. when the value in this dict is a
list, it usually is in Augmentations Testing.
Args:
results (dict | list[dict]): Result dict from the data pipeline.
Returns:
dict | List[dict]:
- 'inputs' (dict): The forward data of models. It usually contains
following keys:
- points
- img
- 'data_samples' (:obj:`NeRFDet3DDataSample`): The annotation info
of the sample.
"""
        # test-time augmentation
if isinstance(results, list):
if len(results) == 1:
# simple test
return self.pack_single_results(results[0])
pack_results = []
for single_result in results:
pack_results.append(self.pack_single_results(single_result))
return pack_results
        # normal training and simple testing
elif isinstance(results, dict):
return self.pack_single_results(results)
else:
raise NotImplementedError
def pack_single_results(self, results: dict) -> dict:
"""Method to pack the single input data. when the value in this dict is
a list, it usually is in Augmentations Testing.
Args:
results (dict): Result dict from the data pipeline.
Returns:
dict: A dict contains
- 'inputs' (dict): The forward data of models. It usually contains
following keys:
- points
- img
- 'data_samples' (:obj:`NeRFDet3DDataSample`): The annotation info
of the sample.
"""
# Format 3D data
if 'points' in results:
if isinstance(results['points'], BasePoints):
results['points'] = results['points'].tensor
if 'img' in results:
if isinstance(results['img'], list):
# process multiple imgs in single frame
imgs = np.stack(results['img'], axis=0)
if imgs.flags.c_contiguous:
imgs = to_tensor(imgs).permute(0, 3, 1, 2).contiguous()
else:
imgs = to_tensor(
np.ascontiguousarray(imgs.transpose(0, 3, 1, 2)))
results['img'] = imgs
else:
img = results['img']
if len(img.shape) < 3:
img = np.expand_dims(img, -1)
                # To improve the computational speed by 3-5 times, apply:
# `torch.permute()` rather than `np.transpose()`.
# Refer to https://github.com/open-mmlab/mmdetection/pull/9533
# for more details
if img.flags.c_contiguous:
img = to_tensor(img).permute(2, 0, 1).contiguous()
else:
img = to_tensor(
np.ascontiguousarray(img.transpose(2, 0, 1)))
results['img'] = img
if 'depth' in results:
if isinstance(results['depth'], list):
# process multiple depth imgs in single frame
depth_imgs = np.stack(results['depth'], axis=0)
if depth_imgs.flags.c_contiguous:
depth_imgs = to_tensor(depth_imgs).contiguous()
else:
depth_imgs = to_tensor(np.ascontiguousarray(depth_imgs))
results['depth'] = depth_imgs
else:
depth_img = results['depth']
if len(depth_img.shape) < 3:
depth_img = np.expand_dims(depth_img, -1)
if depth_img.flags.c_contiguous:
depth_img = to_tensor(depth_img).contiguous()
else:
depth_img = to_tensor(np.ascontiguousarray(depth_img))
results['depth'] = depth_img
if 'ray_info' in results:
if isinstance(results['raydirs'], list):
raydirs = np.stack(results['raydirs'], axis=0)
if raydirs.flags.c_contiguous:
raydirs = to_tensor(raydirs).contiguous()
else:
raydirs = to_tensor(np.ascontiguousarray(raydirs))
results['raydirs'] = raydirs
if isinstance(results['lightpos'], list):
lightposes = np.stack(results['lightpos'], axis=0)
if lightposes.flags.c_contiguous:
lightposes = to_tensor(lightposes).contiguous()
else:
lightposes = to_tensor(np.ascontiguousarray(lightposes))
lightposes = lightposes.unsqueeze(1).repeat(
1, raydirs.shape[1], 1)
results['lightpos'] = lightposes
if isinstance(results['gt_images'], list):
gt_images = np.stack(results['gt_images'], axis=0)
if gt_images.flags.c_contiguous:
gt_images = to_tensor(gt_images).contiguous()
else:
gt_images = to_tensor(np.ascontiguousarray(gt_images))
results['gt_images'] = gt_images
if isinstance(results['gt_depths'],
list) and len(results['gt_depths']) != 0:
gt_depths = np.stack(results['gt_depths'], axis=0)
if gt_depths.flags.c_contiguous:
gt_depths = to_tensor(gt_depths).contiguous()
else:
gt_depths = to_tensor(np.ascontiguousarray(gt_depths))
results['gt_depths'] = gt_depths
if isinstance(results['denorm_images'], list):
denorm_imgs = np.stack(results['denorm_images'], axis=0)
if denorm_imgs.flags.c_contiguous:
denorm_imgs = to_tensor(denorm_imgs).permute(
0, 3, 1, 2).contiguous()
else:
denorm_imgs = to_tensor(
np.ascontiguousarray(
denorm_imgs.transpose(0, 3, 1, 2)))
results['denorm_images'] = denorm_imgs
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_bboxes_labels', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers_2d', 'depths', 'gt_labels_3d'
]:
if key not in results:
continue
if isinstance(results[key], list):
results[key] = [to_tensor(res) for res in results[key]]
else:
results[key] = to_tensor(results[key])
if 'gt_bboxes_3d' in results:
if not isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes):
results['gt_bboxes_3d'] = to_tensor(results['gt_bboxes_3d'])
if 'gt_semantic_seg' in results:
results['gt_semantic_seg'] = to_tensor(
results['gt_semantic_seg'][None])
if 'gt_seg_map' in results:
results['gt_seg_map'] = results['gt_seg_map'][None, ...]
if 'gt_images' in results:
results['gt_images'] = to_tensor(results['gt_images'])
if 'gt_depths' in results:
results['gt_depths'] = to_tensor(results['gt_depths'])
data_sample = NeRFDet3DDataSample()
gt_instances_3d = InstanceData()
gt_instances = InstanceData()
gt_pts_seg = PointData()
gt_nerf_images = InstanceData()
gt_nerf_depths = InstanceData()
data_metas = {}
for key in self.meta_keys:
if key in results:
data_metas[key] = results[key]
elif 'images' in results:
if len(results['images'].keys()) == 1:
cam_type = list(results['images'].keys())[0]
# single-view image
if key in results['images'][cam_type]:
data_metas[key] = results['images'][cam_type][key]
else:
# multi-view image
img_metas = []
cam_types = list(results['images'].keys())
for cam_type in cam_types:
if key in results['images'][cam_type]:
img_metas.append(results['images'][cam_type][key])
if len(img_metas) > 0:
data_metas[key] = img_metas
elif 'lidar_points' in results:
if key in results['lidar_points']:
data_metas[key] = results['lidar_points'][key]
data_sample.set_metainfo(data_metas)
inputs = {}
for key in self.keys:
if key in results:
# if key in self.INPUTS_KEYS:
if key in self.NERF_INPUT_KEYS:
inputs[key] = results[key]
elif key in self.NERF_3D_KEYS:
if key == 'gt_images':
gt_nerf_images[self._remove_prefix(key)] = results[key]
else:
gt_nerf_depths[self._remove_prefix(key)] = results[key]
elif key in self.INSTANCEDATA_3D_KEYS:
gt_instances_3d[self._remove_prefix(key)] = results[key]
elif key in self.INSTANCEDATA_2D_KEYS:
if key == 'gt_bboxes_labels':
gt_instances['labels'] = results[key]
else:
gt_instances[self._remove_prefix(key)] = results[key]
elif key in self.SEG_KEYS:
gt_pts_seg[self._remove_prefix(key)] = results[key]
else:
                raise NotImplementedError(f'Please modify '
                                          f'`PackNeRFDetInputs` '
                                          f'to put {key} into the '
                                          f'corresponding field')
data_sample.gt_instances_3d = gt_instances_3d
data_sample.gt_instances = gt_instances
data_sample.gt_pts_seg = gt_pts_seg
data_sample.gt_nerf_images = gt_nerf_images
data_sample.gt_nerf_depths = gt_nerf_depths
if 'eval_ann_info' in results:
data_sample.eval_ann_info = results['eval_ann_info']
else:
data_sample.eval_ann_info = None
packed_results = dict()
packed_results['data_samples'] = data_sample
packed_results['inputs'] = inputs
return packed_results
def __repr__(self) -> str:
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(keys={self.keys})'
repr_str += f'(meta_keys={self.meta_keys})'
return repr_str
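# --- Illustrative sketch only: packing a toy pipeline result with
# `PackNeRFDetInputs`. The dummy shapes and the helper name
# `_demo_pack_inputs` are assumptions for exposition; a real pipeline result
# carries the full set of keys collected in the NeRF-Det config.
def _demo_pack_inputs() -> None:
    packer = PackNeRFDetInputs(keys=('img', 'gt_bboxes_3d', 'gt_labels_3d'))
    results = dict(
        img=np.zeros((8, 8, 3), dtype=np.float32),        # one HxWx3 image
        gt_bboxes_3d=np.zeros((2, 7), dtype=np.float32),  # two dummy boxes
        gt_labels_3d=np.array([0, 1]))
    packed = packer(results)
    # The image is converted to a (3, 8, 8) tensor and stored under 'inputs';
    # boxes and labels land in `data_samples.gt_instances_3d`.
    print(packed['inputs']['img'].shape,
          packed['data_samples'].gt_instances_3d)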