Unverified commit 9073a3b5, authored by Tai-Wang, committed by GitHub

[Refactor] Support imvoxelnet at SUN RGB-D on 1.x branch (#2141)

* Support imvoxelnet@sunrgbd on 1.x branch

* Add unit tests

* Update README.md

* Update imvoxelnet_2xb4_sunrgbd-3d-10class.py

* Add typehints

* Fix lint

* Fix BC-breaking caused by updated keys

* Add coord_type in the imvoxelnet kitti config
parent bd1525ec
@@ -26,6 +26,12 @@ Results for SUN RGB-D, ScanNet and nuScenes are currently available in ImVoxelNe
| :--------------------------------------------: | :---: | :-----: | :------: | :------------: | :---: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [ResNet-50](./imvoxelnet_8xb4_kitti-3d-car.py) | Car | 3x | | | 17.26 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x8_kitti-3d-car/imvoxelnet_4x8_kitti-3d-car_20210830_003014-3d0ffdf4.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x8_kitti-3d-car/imvoxelnet_4x8_kitti-3d-car_20210830_003014.log.json) |
### SUN RGB-D
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP@0.25 | mAP@0.5 | Download |
| :-------------------------------------------------: | :-----: | :------: | :------------: | :------: | :-----: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [ResNet-50](./imvoxelnet_2xb4_sunrgbd-3d-10class.py) | 2x | 7.2 | 22.5 | 40.96 | 13.50 | [model](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x2_sunrgbd-3d-10class/imvoxelnet_4x2_sunrgbd-3d-10class_20220809_184416-29ca7d2e.pth) \| [log](https://download.openmmlab.com/mmdetection3d/v1.0.0_models/imvoxelnet/imvoxelnet_4x2_sunrgbd-3d-10class/imvoxelnet_4x2_sunrgbd-3d-10class_20220809_184416.log.json) |
## Citation
```latex
......
_base_ = [
'../_base_/schedules/mmdet-schedule-1x.py', '../_base_/default_runtime.py'
]
prior_generator = dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-3.2, -0.2, -2.28, 3.2, 6.2, 0.28]],
rotations=[.0])
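For intuition, here is a minimal sketch (not the mmdet3d implementation, whose point ordering may differ) of the aligned grid this generator is asked for: `n_voxels` cell-center points per axis inside `ranges`, which the detector later fetches via `grid_anchors([n_voxels[::-1]], ...)` and truncates to xyz:

```python
import torch

def aligned_grid_points(pc_range, n_voxels):
    """Cell-center grid over (x_min, y_min, z_min, x_max, y_max, z_max)."""
    mins = torch.tensor(pc_range[:3])
    maxs = torch.tensor(pc_range[3:])
    sizes = (maxs - mins) / torch.tensor(n_voxels, dtype=torch.float)
    axes = [
        mins[i] + sizes[i] * (torch.arange(n_voxels[i]) + 0.5)
        for i in range(3)
    ]
    return torch.cartesian_prod(axes[0], axes[1], axes[2])  # (N, 3) xyz

points = aligned_grid_points([-3.2, -0.2, -2.28, 3.2, 6.2, 0.28], [40, 40, 16])
assert points.shape == (40 * 40 * 16, 3)
```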
model = dict(
type='ImVoxelNet',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=32),
backbone=dict(
type='mmdet.ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
style='pytorch'),
neck=dict(
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=4),
neck_3d=dict(
type='IndoorImVoxelNeck',
in_channels=256,
out_channels=128,
n_blocks=[1, 1, 1]),
bbox_head=dict(
type='ImVoxelHead',
n_classes=10,
n_levels=3,
n_channels=128,
n_reg_outs=7,
pts_assign_threshold=27,
pts_center_threshold=18,
prior_generator=prior_generator),
prior_generator=prior_generator,
n_voxels=[40, 40, 16],
coord_type='DEPTH',
train_cfg=dict(),
test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01))
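As a quick sanity check, the whole detector can be built from this file. A hedged sketch, assuming an mmdet3d 1.x environment where `register_all_modules` is available and this config sits at the path below:

```python
from mmengine.config import Config

from mmdet3d.registry import MODELS
from mmdet3d.utils import register_all_modules

register_all_modules()  # populate the mmdet3d registries
cfg = Config.fromfile(
    'configs/imvoxelnet/imvoxelnet_2xb4_sunrgbd-3d-10class.py')
model = MODELS.build(cfg.model)
print(type(model).__name__)  # ImVoxelNet
```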
dataset_type = 'SUNRGBDDataset'
data_root = 'data/sunrgbd/'
class_names = [
'bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub'
]
metainfo = dict(CLASSES=class_names)
file_client_args = dict(backend='disk')
# Uncomment the following if you use Ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/sunrgbd/':
# 's3://openmmlab/datasets/detection3d/sunrgbd_processed/',
# 'data/sunrgbd/':
# 's3://openmmlab/datasets/detection3d/sunrgbd_processed/'
# }))
train_pipeline = [
dict(type='LoadAnnotations3D'),
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='RandomResize', scale=[(512, 384), (768, 576)], keep_ratio=True),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='Pack3DDetInputs', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(640, 480), keep_ratio=True),
dict(type='Pack3DDetInputs', keys=['img'])
]
train_dataloader = dict(
batch_size=4,
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='sunrgbd_infos_train.pkl',
pipeline=train_pipeline,
test_mode=False,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo)))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
test_mode=True,
box_type_3d='Depth',
metainfo=metainfo))
test_dataloader = val_dataloader
val_evaluator = dict(
type='IndoorMetric',
ann_file=data_root + 'sunrgbd_infos_val.pkl',
metric='bbox')
test_evaluator = val_evaluator
# optimizer
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(
_delete_=True, type='AdamW', lr=0.0001, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
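The step schedule above is easy to verify by hand; a small sketch of the per-epoch learning rate implied by `milestones=[8, 11]` and `gamma=0.1`:

```python
base_lr, gamma, milestones = 1e-4, 0.1, (8, 11)
for epoch in range(12):
    lr = base_lr * gamma ** sum(epoch >= m for m in milestones)
    print(epoch, lr)  # epochs 0-7 -> 1e-4, epochs 8-10 -> 1e-5, epoch 11 -> 1e-6
```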
# hooks
default_hooks = dict(checkpoint=dict(type='CheckpointHook', max_keep_ckpts=1))
# runtime
find_unused_parameters = True # only 1 of 4 FPN outputs is used
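A hedged sketch of launching training with this config through mmengine's `Runner` (the usual entry point is `tools/train.py`; the `work_dir` below is arbitrary):

```python
from mmengine.config import Config
from mmengine.runner import Runner

from mmdet3d.utils import register_all_modules

register_all_modules()
cfg = Config.fromfile(
    'configs/imvoxelnet/imvoxelnet_2xb4_sunrgbd-3d-10class.py')
cfg.work_dir = './work_dirs/imvoxelnet_sunrgbd'
Runner.from_cfg(cfg).train()
```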
@@ -52,7 +52,8 @@ model = dict(
type='mmdet.CrossEntropyLoss', use_sigmoid=False,
loss_weight=0.2)),
n_voxels=[216, 248, 12],
anchor_generator=dict(
coord_type='LIDAR',
prior_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-0.16, -39.68, -3.08, 68.96, 39.68, 0.76]],
rotations=[.0]),
......
@@ -9,6 +9,7 @@ from .fcaf3d_head import FCAF3DHead
from .fcos_mono3d_head import FCOSMono3DHead
from .free_anchor3d_head import FreeAnchor3DHead
from .groupfree3d_head import GroupFree3DHead
from .imvoxel_head import ImVoxelHead
from .monoflex_head import MonoFlexHead
from .parta2_rpn_head import PartA2RPNHead
from .pgd_head import PGDHead
@@ -23,5 +24,5 @@ __all__ = [
'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead',
'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead',
'MonoFlexHead', 'Base3DDenseHead', 'FCAF3DHead'
'MonoFlexHead', 'Base3DDenseHead', 'FCAF3DHead', 'ImVoxelHead'
]
This diff is collapsed.
@@ -7,6 +7,7 @@ from mmengine.structures import InstanceData
from mmdet3d.models.detectors import Base3DDetector
from mmdet3d.models.layers.fusion_layers.point_fusion import point_sample
from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures.bbox_3d import get_proj_mat_by_coord_type
from mmdet3d.structures.det3d_data_sample import SampleList
from mmdet3d.utils import ConfigType, OptConfigType, OptInstanceList
@@ -20,9 +21,11 @@ class ImVoxelNet(Base3DDetector):
neck (:obj:`ConfigDict` or dict): The neck config.
neck_3d (:obj:`ConfigDict` or dict): The 3D neck config.
bbox_head (:obj:`ConfigDict` or dict): The bbox head config.
prior_generator (:obj:`ConfigDict` or dict): The prior points
generator config.
n_voxels (list): Number of voxels along x, y, z axis.
anchor_generator (:obj:`ConfigDict` or dict): The anchor generator
config.
        coord_type (str): The type of coordinates of the point cloud:
            'DEPTH', 'LIDAR', or 'CAMERA'.
train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of
training hyper-parameters. Defaults to None.
test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test
@@ -39,8 +42,9 @@ class ImVoxelNet(Base3DDetector):
neck: ConfigType,
neck_3d: ConfigType,
bbox_head: ConfigType,
prior_generator: ConfigType,
n_voxels: List,
anchor_generator: ConfigType,
coord_type: str,
train_cfg: OptConfigType = None,
test_cfg: OptConfigType = None,
data_preprocessor: OptConfigType = None,
@@ -53,8 +57,9 @@ class ImVoxelNet(Base3DDetector):
bbox_head.update(train_cfg=train_cfg)
bbox_head.update(test_cfg=test_cfg)
self.bbox_head = MODELS.build(bbox_head)
self.prior_generator = TASK_UTILS.build(prior_generator)
self.n_voxels = n_voxels
self.anchor_generator = TASK_UTILS.build(anchor_generator)
self.coord_type = coord_type
self.train_cfg = train_cfg
self.test_cfg = test_cfg
@@ -62,6 +67,8 @@
batch_data_samples: SampleList):
"""Extract 3d features from the backbone -> fpn -> 3d projection.
-> 3d neck -> bbox_head.
Args:
batch_inputs_dict (dict): The model input dict which include
the 'imgs' key.
@@ -72,7 +79,9 @@
as `gt_instance` or `gt_panoptic_seg` or `gt_sem_seg`.
Returns:
torch.Tensor: of shape (N, C_out, N_x, N_y, N_z)
Tuple:
- torch.Tensor: Features of shape (N, C_out, N_x, N_y, N_z).
- torch.Tensor: Valid mask of shape (N, 1, N_x, N_y, N_z).
"""
img = batch_inputs_dict['imgs']
batch_img_metas = [
@@ -80,9 +89,9 @@
]
x = self.backbone(img)
x = self.neck(x)[0]
points = self.anchor_generator.grid_anchors(
[self.n_voxels[::-1]], device=img.device)[0][:, :3]
volumes = []
points = self.prior_generator.grid_anchors([self.n_voxels[::-1]],
device=img.device)[0][:, :3]
volumes, valid_preds = [], []
for feature, img_meta in zip(x, batch_img_metas):
img_scale_factor = (
points.new_tensor(img_meta['scale_factor'][:2])
@@ -91,13 +100,14 @@
img_crop_offset = (
points.new_tensor(img_meta['img_crop_offset'])
if 'img_crop_offset' in img_meta.keys() else 0)
lidar2img = points.new_tensor(img_meta['lidar2img'])
proj_mat = points.new_tensor(
get_proj_mat_by_coord_type(img_meta, self.coord_type))
volume = point_sample(
img_meta,
img_features=feature[None, ...],
points=points,
proj_mat=lidar2img,
coord_type='LIDAR',
proj_mat=points.new_tensor(proj_mat),
coord_type=self.coord_type,
img_scale_factor=img_scale_factor,
img_crop_offset=img_crop_offset,
img_flip=img_flip,
@@ -106,9 +116,11 @@
aligned=False)
volumes.append(
volume.reshape(self.n_voxels[::-1] + [-1]).permute(3, 2, 1, 0))
valid_preds.append(
~torch.all(volumes[-1] == 0, dim=0, keepdim=True))
x = torch.stack(volumes)
x = self.neck_3d(x)
return x
return x, torch.stack(valid_preds).float()
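The valid-voxel mask built in the loop above has simple semantics: a grid point that projects outside the image gets an all-zero feature vector back from `point_sample`, so the code treats a voxel as valid when any channel is non-zero. A toy check in the `(C, N_x, N_y, N_z)` layout produced by the permute:

```python
import torch

# Toy volume: only a 10x10 x-y patch of voxels projects into the image here.
volume = torch.zeros(128, 40, 40, 16)
volume[:, :10, :10, :] = torch.randn(128, 10, 10, 16)
valid = ~torch.all(volume == 0, dim=0, keepdim=True)  # (1, N_x, N_y, N_z)
assert valid.sum().item() == 10 * 10 * 16
```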
def loss(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
**kwargs) -> Union[dict, list]:
@@ -126,8 +138,12 @@
Returns:
dict: A dictionary of loss components.
"""
x = self.extract_feat(batch_inputs_dict, batch_data_samples)
x, valid_preds = self.extract_feat(batch_inputs_dict,
batch_data_samples)
        # For indoor datasets, ImVoxelNet uses ImVoxelHead, which handles
        # the mask of visible voxels.
if self.coord_type == 'DEPTH':
x += (valid_preds, )
losses = self.bbox_head.loss(x, batch_data_samples, **kwargs)
return losses
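The `x += (valid_preds, )` line leans on Python list semantics: `neck_3d` returns a list of per-level features, and `list += tuple` appends the tuple's elements, so the mask rides along as one extra entry for `ImVoxelHead`. A toy illustration:

```python
feats = ['level_0', 'level_1', 'level_2']  # stand-ins for feature tensors
feats += ('valid_mask',)
assert feats == ['level_0', 'level_1', 'level_2', 'valid_mask']
```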
@@ -159,8 +175,14 @@
- bboxes_3d (Tensor): Contains a tensor with shape
(num_instances, C) where C >=7.
"""
x = self.extract_feat(batch_inputs_dict, batch_data_samples)
results_list = self.bbox_head.predict(x, batch_data_samples, **kwargs)
x, valid_preds = self.extract_feat(batch_inputs_dict,
batch_data_samples)
        # For indoor datasets, ImVoxelNet uses ImVoxelHead, which handles
        # the mask of visible voxels.
if self.coord_type == 'DEPTH':
x += (valid_preds, )
results_list = \
self.bbox_head.predict(x, batch_data_samples, **kwargs)
predictions = self.add_pred_to_datasample(batch_data_samples,
results_list)
return predictions
@@ -182,7 +204,12 @@
Returns:
tuple[list]: A tuple of features from ``bbox_head`` forward.
"""
x = self.extract_feat(batch_inputs_dict, batch_data_samples)
x, valid_preds = self.extract_feat(batch_inputs_dict,
batch_data_samples)
        # For indoor datasets, ImVoxelNet uses ImVoxelHead, which handles
        # the mask of visible voxels.
if self.coord_type == 'DEPTH':
x += (valid_preds, )
results = self.bbox_head.forward(x)
return results
......
@@ -2,10 +2,11 @@
from mmdet.models.necks.fpn import FPN
from .dla_neck import DLANeck
from .imvoxel_neck import OutdoorImVoxelNeck
from .imvoxel_neck import IndoorImVoxelNeck, OutdoorImVoxelNeck
from .pointnet2_fp_neck import PointNetFPNeck
from .second_fpn import SECONDFPN
__all__ = [
'FPN', 'SECONDFPN', 'OutdoorImVoxelNeck', 'PointNetFPNeck', 'DLANeck'
'FPN', 'SECONDFPN', 'OutdoorImVoxelNeck', 'PointNetFPNeck', 'DLANeck',
'IndoorImVoxelNeck'
]
# Copyright (c) OpenMMLab. All rights reserved.
from mmcv.cnn import ConvModule
from mmengine.model import BaseModule
from torch import nn
from mmdet3d.registry import MODELS
@MODELS.register_module()
class OutdoorImVoxelNeck(nn.Module):
class OutdoorImVoxelNeck(BaseModule):
"""Neck for ImVoxelNet outdoor scenario.
Args:
in_channels (int): Input channels of multi-scale feature map.
out_channels (int): Output channels of multi-scale feature map.
in_channels (int): Number of channels in an input tensor.
out_channels (int): Number of channels in all output tensors.
"""
def __init__(self, in_channels, out_channels):
super().__init__()
super(OutdoorImVoxelNeck, self).__init__()
self.model = nn.Sequential(
ResModule(in_channels),
ResModule(in_channels, in_channels),
ConvModule(
in_channels=in_channels,
out_channels=in_channels * 2,
@@ -27,7 +28,7 @@ class OutdoorImVoxelNeck(nn.Module):
conv_cfg=dict(type='Conv3d'),
norm_cfg=dict(type='BN3d'),
act_cfg=dict(type='ReLU', inplace=True)),
ResModule(in_channels * 2),
ResModule(in_channels * 2, in_channels * 2),
ConvModule(
in_channels=in_channels * 2,
out_channels=in_channels * 4,
@@ -37,7 +38,7 @@
conv_cfg=dict(type='Conv3d'),
norm_cfg=dict(type='BN3d'),
act_cfg=dict(type='ReLU', inplace=True)),
ResModule(in_channels * 4),
ResModule(in_channels * 4, in_channels * 4),
ConvModule(
in_channels=in_channels * 4,
out_channels=out_channels,
@@ -66,31 +67,148 @@ class OutdoorImVoxelNeck(nn.Module):
pass
@MODELS.register_module()
class IndoorImVoxelNeck(BaseModule):
"""Neck for ImVoxelNet outdoor scenario.
Args:
in_channels (int): Number of channels in an input tensor.
out_channels (int): Number of channels in all output tensors.
n_blocks (list[int]): Number of blocks for each feature level.
"""
def __init__(self, in_channels, out_channels, n_blocks):
super(IndoorImVoxelNeck, self).__init__()
self.n_scales = len(n_blocks)
n_channels = in_channels
for i in range(len(n_blocks)):
stride = 1 if i == 0 else 2
self.__setattr__(f'down_layer_{i}',
self._make_layer(stride, n_channels, n_blocks[i]))
n_channels = n_channels * stride
if i > 0:
self.__setattr__(
f'up_block_{i}',
self._make_up_block(n_channels, n_channels // 2))
self.__setattr__(f'out_block_{i}',
self._make_block(n_channels, out_channels))
def forward(self, x):
"""Forward function.
Args:
x (torch.Tensor): of shape (N, C_in, N_x, N_y, N_z).
Returns:
list[torch.Tensor]: of shape (N, C_out, N_xi, N_yi, N_zi).
"""
down_outs = []
for i in range(self.n_scales):
x = self.__getattr__(f'down_layer_{i}')(x)
down_outs.append(x)
outs = []
for i in range(self.n_scales - 1, -1, -1):
if i < self.n_scales - 1:
x = self.__getattr__(f'up_block_{i + 1}')(x)
x = down_outs[i] + x
out = self.__getattr__(f'out_block_{i}')(x)
outs.append(out)
return outs[::-1]
@staticmethod
def _make_layer(stride, n_channels, n_blocks):
"""Make a layer from several residual blocks.
Args:
stride (int): Stride of the first residual block.
n_channels (int): Number of channels of the first residual block.
n_blocks (int): Number of residual blocks.
Returns:
            torch.nn.Module: A module composed of several residual blocks.
"""
blocks = []
for i in range(n_blocks):
if i == 0 and stride != 1:
blocks.append(ResModule(n_channels, n_channels * 2, stride))
n_channels = n_channels * 2
else:
blocks.append(ResModule(n_channels, n_channels))
return nn.Sequential(*blocks)
@staticmethod
def _make_block(in_channels, out_channels):
"""Make a convolutional block.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
Returns:
torch.nn.Module: Convolutional block.
"""
return nn.Sequential(
nn.Conv3d(in_channels, out_channels, 3, 1, 1, bias=False),
nn.BatchNorm3d(out_channels), nn.ReLU(inplace=True))
@staticmethod
def _make_up_block(in_channels, out_channels):
"""Make upsampling convolutional block.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
Returns:
torch.nn.Module: Upsampling convolutional block.
"""
return nn.Sequential(
nn.ConvTranspose3d(in_channels, out_channels, 2, 2, bias=False),
nn.BatchNorm3d(out_channels), nn.ReLU(inplace=True),
nn.Conv3d(out_channels, out_channels, 3, 1, 1, bias=False),
nn.BatchNorm3d(out_channels), nn.ReLU(inplace=True))
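A hedged shape walk for the SUN RGB-D settings above (`in_channels=256`, `out_channels=128`, `n_blocks=[1, 1, 1]`, a 40x40x16 grid), assuming the class is importable as below: the encoder halves the grid and doubles the channels twice (256 -> 512 -> 1024), and each `out_block_i` maps its level to `out_channels`:

```python
import torch

from mmdet3d.models.necks import IndoorImVoxelNeck

neck = IndoorImVoxelNeck(in_channels=256, out_channels=128, n_blocks=[1, 1, 1])
outs = neck(torch.randn(1, 256, 40, 40, 16))
# Three levels, fine to coarse, all mapped to 128 channels.
assert [tuple(o.shape) for o in outs] == [
    (1, 128, 40, 40, 16), (1, 128, 20, 20, 8), (1, 128, 10, 10, 4)
]
```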
class ResModule(nn.Module):
"""3d residual block for ImVoxelNeck.
Args:
n_channels (int): Input channels of a feature map.
in_channels (int): Number of channels in input tensor.
out_channels (int): Number of channels in output tensor.
stride (int, optional): Stride of the block. Defaults to 1.
"""
def __init__(self, n_channels):
def __init__(self, in_channels, out_channels, stride=1):
super().__init__()
self.conv0 = ConvModule(
in_channels=n_channels,
out_channels=n_channels,
in_channels=in_channels,
out_channels=out_channels,
kernel_size=3,
stride=stride,
padding=1,
conv_cfg=dict(type='Conv3d'),
norm_cfg=dict(type='BN3d'),
act_cfg=dict(type='ReLU', inplace=True))
self.conv1 = ConvModule(
in_channels=n_channels,
out_channels=n_channels,
in_channels=out_channels,
out_channels=out_channels,
kernel_size=3,
padding=1,
conv_cfg=dict(type='Conv3d'),
norm_cfg=dict(type='BN3d'),
act_cfg=None)
if stride != 1:
self.downsample = ConvModule(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
stride=stride,
padding=0,
conv_cfg=dict(type='Conv3d'),
norm_cfg=dict(type='BN3d'),
act_cfg=None)
self.stride = stride
self.activation = nn.ReLU(inplace=True)
def forward(self, x):
@@ -105,6 +223,8 @@ class ResModule(nn.Module):
identity = x
x = self.conv0(x)
x = self.conv1(x)
x = identity + x
if self.stride != 1:
identity = self.downsample(identity)
x = x + identity
x = self.activation(x)
return x
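And a matching sketch for the updated `ResModule`: with `stride=2` it halves each spatial dimension and can change the channel count, projecting the identity through the 1x1 `downsample` branch so the residual addition still matches (import path assumed):

```python
import torch

from mmdet3d.models.necks.imvoxel_neck import ResModule

block = ResModule(in_channels=64, out_channels=128, stride=2)
out = block(torch.randn(1, 64, 8, 8, 4))
assert tuple(out.shape) == (1, 128, 4, 4, 2)  # identity downsampled via 1x1 conv
```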
@@ -9,10 +9,10 @@ from mmdet3d.models.dense_heads import FCAF3DHead
from mmdet3d.testing import create_detector_inputs
class TestAnchor3DHead(TestCase):
class TestFCAF3DHead(TestCase):
def test_fcaf3d_head_loss(self):
"""Test anchor head loss when truth is empty and non-empty."""
"""Test fcaf3d head loss when truth is empty and non-empty."""
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
......
# Copyright (c) OpenMMLab. All rights reserved.
from unittest import TestCase
import pytest
import torch
from mmdet3d import * # noqa
from mmdet3d.models.dense_heads import ImVoxelHead
from mmdet3d.testing import create_detector_inputs
class TestImVoxelHead(TestCase):
def test_imvoxel_head_loss(self):
"""Test imvoxel head loss when truth is empty and non-empty."""
if not torch.cuda.is_available():
pytest.skip('test requires GPU and torch+cuda')
# build head
prior_generator = dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-3.2, -0.2, -2.28, 3.2, 6.2, 0.28]],
rotations=[.0])
imvoxel_head = ImVoxelHead(
n_classes=1,
n_levels=1,
n_channels=32,
n_reg_outs=7,
pts_assign_threshold=27,
pts_center_threshold=18,
prior_generator=prior_generator,
center_loss=dict(type='mmdet.CrossEntropyLoss', use_sigmoid=True),
bbox_loss=dict(type='RotatedIoU3DLoss'),
cls_loss=dict(type='mmdet.FocalLoss'),
)
imvoxel_head = imvoxel_head.cuda()
# fake input of head
# (x, valid_preds)
x = [
torch.randn(1, 32, 10, 10, 4).cuda(),
torch.ones(1, 1, 10, 10, 4).cuda()
]
# fake annotation
num_gt_instance = 1
packed_inputs = create_detector_inputs(
with_points=False,
with_img=True,
img_size=(128, 128),
num_gt_instance=num_gt_instance,
with_pts_semantic_mask=False,
with_pts_instance_mask=False)
data_samples = [
sample.cuda() for sample in packed_inputs['data_samples']
]
losses = imvoxel_head.loss(x, data_samples)
self.assertGreaterEqual(losses['center_loss'], 0)
self.assertGreaterEqual(losses['bbox_loss'], 0)
self.assertGreaterEqual(losses['cls_loss'], 0)
@@ -10,11 +10,12 @@ from mmdet3d.testing import (create_detector_inputs, get_detector_cfg,
class TestImVoxelNet(unittest.TestCase):
def test_imvoxelnet(self):
def test_imvoxelnet_kitti(self):
import mmdet3d.models
assert hasattr(mmdet3d.models, 'ImVoxelNet')
DefaultScope.get_instance('test_ImVoxelNet', scope_name='mmdet3d')
DefaultScope.get_instance(
'test_imvoxelnet_kitti', scope_name='mmdet3d')
setup_seed(0)
imvoxel_net_cfg = get_detector_cfg(
'imvoxelnet/imvoxelnet_8xb4_kitti-3d-car.py')
@@ -47,3 +48,42 @@ class TestImVoxelNet(unittest.TestCase):
self.assertGreaterEqual(losses['loss_cls'][0], 0)
self.assertGreaterEqual(losses['loss_bbox'][0], 0)
self.assertGreaterEqual(losses['loss_dir'][0], 0)
def test_imvoxelnet_sunrgbd(self):
import mmdet3d.models
assert hasattr(mmdet3d.models, 'ImVoxelNet')
DefaultScope.get_instance(
'test_imvoxelnet_sunrgbd', scope_name='mmdet3d')
setup_seed(0)
imvoxel_net_cfg = get_detector_cfg(
'imvoxelnet/imvoxelnet_2xb4_sunrgbd-3d-10class.py')
model = MODELS.build(imvoxel_net_cfg)
num_gt_instance = 1
packed_inputs = create_detector_inputs(
with_points=False,
with_img=True,
img_size=(128, 128),
num_gt_instance=num_gt_instance,
with_pts_semantic_mask=False,
with_pts_instance_mask=False)
if torch.cuda.is_available():
model = model.cuda()
# test simple_test
with torch.no_grad():
data = model.data_preprocessor(packed_inputs, True)
torch.cuda.empty_cache()
results = model.forward(**data, mode='predict')
self.assertEqual(len(results), 1)
self.assertIn('bboxes_3d', results[0].pred_instances_3d)
self.assertIn('scores_3d', results[0].pred_instances_3d)
self.assertIn('labels_3d', results[0].pred_instances_3d)
            # compute losses under no_grad to save memory
with torch.no_grad():
losses = model.forward(**data, mode='loss')
self.assertGreaterEqual(losses['center_loss'], 0)
self.assertGreaterEqual(losses['bbox_loss'], 0)
self.assertGreaterEqual(losses['cls_loss'], 0)