Merge branch 'feature_parta2_backbone' into 'master'

Feature parta2 backbone. See merge request open-mmlab/mmdet.3d!9

Merge branch 'feature_parta2_backbone' into 'master'
Feature parta2 backbone. See merge request open-mmlab/mmdet.3d!9
2a7c24bb · zhangwenwei · 1d34dfce · ad798fb3 · 2a7c24bb · 2a7c24bb
Commit 2a7c24bb authored May 02, 2020 by zhangwenwei
7 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -12,7 +12,7 @@ repos:
    hooks:
        - id: isort
  - repo: https://github.com/pre-commit/mirrors-yapf
-    rev: v0.29.0
+    rev: v0.30.0
    hooks:
      - id: yapf
  - repo: https://github.com/pre-commit/pre-commit-hooks

--- a/mmdet3d/models/middle_encoders/__init__.py
+++ b/mmdet3d/models/middle_encoders/__init__.py
 from .pillar_scatter import PointPillarsScatter
 from .sparse_encoder import SparseEncoder
+from .sparse_unet import SparseUNet

-__all__ = ['PointPillarsScatter', 'SparseEncoder']
+__all__ = ['PointPillarsScatter', 'SparseEncoder', 'SparseUNet']
--- a/mmdet3d/models/middle_encoders/sparse_unet.py
+++ b/mmdet3d/models/middle_encoders/sparse_unet.py
+import torch
+import torch.nn as nn
+
+import mmdet3d.ops.spconv as spconv
+from mmdet3d.ops import SparseBasicBlock
+from mmdet.ops import build_norm_layer
+from ..registry import MIDDLE_ENCODERS
+
+
+@MIDDLE_ENCODERS.register_module
+class SparseUNet(nn.Module):
+
+    def __init__(self,
+                 in_channels,
+                 output_shape,
+                 pre_act=False,
+                 norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
+                 base_channels=16,
+                 output_channels=128,
+                 encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64,
+                                                                        64)),
+                 encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1,
+                                                                 1)),
+                 decoder_channels=((64, 64, 64), (64, 64, 32), (32, 32, 16),
+                                   (16, 16, 16)),
+                 decoder_paddings=((1, 0), (1, 0), (0, 0), (0, 1))):
+        """SparseUNet for PartA^2.
+
+        See https://arxiv.org/abs/1907.03670 for more details.
+
+        Builds, in this order: the input convolution, the encoder stages,
+        the decoder blocks and the output convolution that is applied to the
+        last encoder stage.
+
+        Args:
+            in_channels (int): the number of input channels
+            output_shape (list[int]): the shape of output tensor; it is also
+                stored as ``sparse_shape`` and used as the spatial shape of
+                the input sparse tensor in ``forward``
+            pre_act (bool): use pre_act_block or post_act_block
+            norm_cfg (dict): config of normalization layer
+            base_channels (int): out channels for conv_input layer
+            output_channels (int): out channels for conv_out layer
+            encoder_channels (tuple[tuple[int]]):
+                conv channels of each encode block
+            encoder_paddings (tuple[tuple[int]]): paddings of each encode
+                block; an entry may itself be a tuple, as in the default
+            decoder_channels (tuple[tuple[int]]):
+                conv channels of each decode block
+            decoder_paddings (tuple[tuple[int]]): paddings of each decode block
+        """
+        super().__init__()
+        # The same shape is kept under both names.
+        self.sparse_shape = output_shape
+        self.output_shape = output_shape
+        self.in_channels = in_channels
+        self.pre_act = pre_act
+        self.base_channels = base_channels
+        self.output_channels = output_channels
+        self.encoder_channels = encoder_channels
+        self.encoder_paddings = encoder_paddings
+        self.decoder_channels = decoder_channels
+        self.decoder_paddings = decoder_paddings
+        # ``forward`` runs one decoder block per encoder stage, so
+        # ``decoder_channels`` needs one entry per ``encoder_channels`` entry.
+        self.stage_num = len(self.encoder_channels)
+        # Spconv initializes all weights on its own
+
+        if pre_act:
+            # Pre-activation blocks apply norm + ReLU before their own conv,
+            # so the input layer is a bare convolution.
+            # TODO: use ConvModule to encapsulate
+            self.conv_input = spconv.SparseSequential(
+                spconv.SubMConv3d(
+                    in_channels,
+                    self.base_channels,
+                    3,
+                    padding=1,
+                    bias=False,
+                    indice_key='subm1'))
+            make_block = self.pre_act_block
+        else:
+            # Post-activation: conv -> norm -> ReLU.
+            self.conv_input = spconv.SparseSequential(
+                spconv.SubMConv3d(
+                    in_channels,
+                    self.base_channels,
+                    3,
+                    padding=1,
+                    bias=False,
+                    indice_key='subm1'),
+                build_norm_layer(norm_cfg, self.base_channels)[1], nn.ReLU())
+            make_block = self.post_act_block
+
+        # The encoder reports its final channel count, which sizes both the
+        # first decoder block and the output convolution below.
+        encoder_out_channels = self.make_encoder_layers(
+            make_block, norm_cfg, self.base_channels)
+        self.make_decoder_layers(make_block, norm_cfg, encoder_out_channels)
+
+        self.conv_out = spconv.SparseSequential(
+            # [200, 176, 5] -> [200, 176, 2]
+            spconv.SparseConv3d(
+                encoder_out_channels,
+                self.output_channels, (3, 1, 1),
+                stride=(2, 1, 1),
+                padding=0,
+                bias=False,
+                indice_key='spconv_down2'),
+            build_norm_layer(norm_cfg, self.output_channels)[1],
+            nn.ReLU())
+
+    def forward(self, voxel_features, coors, batch_size):
+        """Forward of SparseUNet.
+
+        Runs the encoder once, then derives two outputs from it: a dense
+        feature map from ``conv_out`` (for the detection head) and per-voxel
+        features from the decoder (for the segmentation head).
+
+        Args:
+            voxel_features (torch.float32): shape [N, C]
+            coors (torch.int32): shape [N, 4](batch_idx, z_idx, y_idx, x_idx)
+            batch_size (int): batch size
+
+        Returns:
+            dict: backbone features with two keys:
+                ``spatial_features``, the dense output of ``conv_out``
+                reshaped from (N, C, D, H, W) to (N, C * D, H, W), and
+                ``seg_features``, the ``features`` of the last decoder output.
+        """
+        coors = coors.int()
+        input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors,
+                                                  self.sparse_shape,
+                                                  batch_size)
+        x = self.conv_input(input_sp_tensor)
+
+        # Keep every stage output; the decoder uses them as lateral inputs.
+        encode_features = []
+        for encoder_layer in self.encoder_layers:
+            x = encoder_layer(x)
+            encode_features.append(x)
+
+        # for detection head
+        # [200, 176, 5] -> [200, 176, 2]
+        out = self.conv_out(encode_features[-1])
+        spatial_features = out.dense()
+
+        # Fold the depth axis into the channel axis.
+        N, C, D, H, W = spatial_features.shape
+        spatial_features = spatial_features.view(N, C * D, H, W)
+
+        # for segmentation head, with output shape:
+        # [400, 352, 11] <- [200, 176, 5]
+        # [800, 704, 21] <- [400, 352, 11]
+        # [1600, 1408, 41] <- [800, 704, 21]
+        # [1600, 1408, 41] <- [1600, 1408, 41]
+        decode_features = []
+        x = encode_features[-1]
+        # Walk the stages from the deepest (stage_num) back to the first;
+        # stage i pairs encoder output i - 1 with the layers named *_layer{i}.
+        for i in range(self.stage_num, 0, -1):
+            x = self.decoder_layer_forward(encode_features[i - 1], x,
+                                           getattr(self, f'lateral_layer{i}'),
+                                           getattr(self, f'merge_layer{i}'),
+                                           getattr(self, f'upsample_layer{i}'))
+            decode_features.append(x)
+
+        # Only the last decoder output is returned.
+        seg_features = decode_features[-1].features
+
+        ret = dict(
+            spatial_features=spatial_features, seg_features=seg_features)
+
+        return ret
+
+    def decoder_layer_forward(self, x_lateral, x_bottom, lateral_layer,
+                              merge_layer, upsample_layer):
+        """Run one decoder block: lateral conv, merge, residual add, upsample.
+
+        The lateral output is concatenated with the bottom features along
+        the channel axis and passed through ``merge_layer``. The concatenated
+        features are then summed down to the merged channel count, added to
+        the merged features, and the result is passed to ``upsample_layer``.
+
+        Args:
+            x_lateral (SparseConvTensor): lateral tensor
+            x_bottom (SparseConvTensor): feature from bottom layer
+            lateral_layer (SparseBasicBlock): convolution for lateral tensor
+            merge_layer (SparseSequential): convolution for merging features
+            upsample_layer (SparseSequential): convolution for upsampling
+
+        Returns:
+            SparseConvTensor: upsampled feature
+        """
+        lateral = lateral_layer(x_lateral)
+        # Bottom features first, lateral features second.
+        lateral.features = torch.cat((x_bottom.features, lateral.features),
+                                     dim=1)
+        merged = merge_layer(lateral)
+        merged_channels = merged.features.shape[1]
+        # reduce_channel rewrites ``lateral.features`` in place and returns
+        # the same tensor object.
+        lateral = self.reduce_channel(lateral, merged_channels)
+        lateral.features = merged.features + lateral.features
+        return upsample_layer(lateral)
+
+    @staticmethod
+    def reduce_channel(x, out_channels):
+        """Sum groups of channels so that ``out_channels`` channels remain.
+
+        The (N, C1) feature matrix is viewed as (N, out_channels, -1) and
+        summed over the last axis; the result replaces ``x.features``.
+
+        Args:
+            x (SparseConvTensor): x.features (N, C1)
+            out_channels (int): the number of channel after reduction
+
+        Returns:
+            SparseConvTensor: channel reduced feature (the same object as
+                ``x``, modified in place)
+        """
+        feats = x.features
+        num_rows, num_channels = feats.shape
+        # C1 must be a whole multiple of the target channel count.
+        assert (num_channels % out_channels
+                == 0) and (num_channels >= out_channels)
+
+        grouped = feats.view(num_rows, out_channels, -1)
+        x.features = grouped.sum(dim=2)
+        return x
+
+    def pre_act_block(self,
+                      in_channels,
+                      out_channels,
+                      kernel_size,
+                      indice_key=None,
+                      stride=1,
+                      padding=0,
+                      conv_type='subm',
+                      norm_cfg=None):
+        """Make pre activate sparse convolution block (norm -> ReLU -> conv).
+
+        Args:
+            in_channels (int): the number of input channels
+            out_channels (int): the number of out channels
+            kernel_size (int): kernel size of convolution
+            indice_key (str): the indice key used for sparse tensor
+            stride (int): the stride of convolution; only used by 'spconv'
+            padding (int or list[int]): the padding number of input; used by
+                'subm' and 'spconv', not by 'inverseconv'
+            conv_type (str): conv type in 'subm', 'spconv' or 'inverseconv'
+            norm_cfg (dict): config of normalization layer
+
+        Returns:
+            spconv.SparseSequential: pre activate sparse convolution block.
+        """
+        # TODO: use ConvModule to encapsulate
+        valid_types = ['subm', 'spconv', 'inverseconv']
+        assert conv_type in valid_types
+        if conv_type not in valid_types:
+            # Only reachable when assertions are disabled.
+            raise NotImplementedError
+
+        # Norm and activation act on the block input, hence ``in_channels``;
+        # they are identical for every conv type.
+        norm_layer = build_norm_layer(norm_cfg, in_channels)[1]
+        activation = nn.ReLU(inplace=True)
+
+        if conv_type == 'subm':
+            conv = spconv.SubMConv3d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                padding=padding,
+                bias=False,
+                indice_key=indice_key)
+        elif conv_type == 'spconv':
+            conv = spconv.SparseConv3d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride=stride,
+                padding=padding,
+                bias=False,
+                indice_key=indice_key)
+        else:
+            conv = spconv.SparseInverseConv3d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                bias=False,
+                indice_key=indice_key)
+        return spconv.SparseSequential(norm_layer, activation, conv)
+
+    def post_act_block(self,
+                       in_channels,
+                       out_channels,
+                       kernel_size,
+                       indice_key,
+                       stride=1,
+                       padding=0,
+                       conv_type='subm',
+                       norm_cfg=None):
+        """Make post activate sparse convolution block (conv -> norm -> ReLU).
+
+        Args:
+            in_channels (int): the number of input channels
+            out_channels (int): the number of out channels
+            kernel_size (int): kernel size of convolution
+            indice_key (str): the indice key used for sparse tensor
+            stride (int): the stride of convolution; only used by 'spconv'
+            padding (int or list[int]): the padding number of input; only
+                used by 'spconv' ('subm' and 'inverseconv' do not receive it)
+            conv_type (str): conv type in 'subm', 'spconv' or 'inverseconv'
+            norm_cfg (dict[str]): config of normalization layer
+
+        Returns:
+            spconv.SparseSequential: post activate sparse convolution block.
+        """
+        # TODO: use ConvModule to encapsulate
+        assert conv_type in ['subm', 'spconv', 'inverseconv']
+
+        # Only the convolution differs between the conv types.
+        if conv_type == 'subm':
+            conv = spconv.SubMConv3d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                bias=False,
+                indice_key=indice_key)
+        elif conv_type == 'spconv':
+            conv = spconv.SparseConv3d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                stride=stride,
+                padding=padding,
+                bias=False,
+                indice_key=indice_key)
+        elif conv_type == 'inverseconv':
+            conv = spconv.SparseInverseConv3d(
+                in_channels,
+                out_channels,
+                kernel_size,
+                bias=False,
+                indice_key=indice_key)
+        else:
+            raise NotImplementedError
+
+        # Norm and activation act on the conv output, hence ``out_channels``.
+        norm_layer = build_norm_layer(norm_cfg, out_channels)[1]
+        activation = nn.ReLU(inplace=True)
+        return spconv.SparseSequential(conv, norm_layer, activation)
+
+    def make_encoder_layers(self, make_block, norm_cfg, in_channels):
+        """Make encoder layers using sparse convs.
+
+        One ``SparseSequential`` stage is built per entry of
+        ``self.encoder_channels`` and registered in ``self.encoder_layers``
+        as ``encoder_layer{i + 1}``. Every stage except the first starts
+        with a stride-2 'spconv' block; all other blocks use the default
+        conv type of ``make_block``.
+
+        Args:
+            make_block (method): a bound method to build blocks
+            norm_cfg (dict[str]): config of normalization layer
+            in_channels (int): the number of encoder input channels
+
+        Returns:
+            int: the number of encoder output channels
+        """
+        self.encoder_layers = spconv.SparseSequential()
+        for i, blocks in enumerate(self.encoder_channels):
+            blocks_list = []
+            for j, out_channels in enumerate(blocks):
+                padding = self.encoder_paddings[i][j]
+                # each stage started with a spconv layer
+                # except the first stage
+                if i != 0 and j == 0:
+                    blocks_list.append(
+                        make_block(
+                            in_channels,
+                            out_channels,
+                            3,
+                            norm_cfg=norm_cfg,
+                            stride=2,
+                            padding=padding,
+                            indice_key=f'spconv{i + 1}',
+                            conv_type='spconv'))
+                else:
+                    blocks_list.append(
+                        make_block(
+                            in_channels,
+                            out_channels,
+                            3,
+                            norm_cfg=norm_cfg,
+                            padding=padding,
+                            indice_key=f'subm{i + 1}'))
+                # The next block consumes what this one produces.
+                in_channels = out_channels
+            stage_name = f'encoder_layer{i + 1}'
+            stage_layers = spconv.SparseSequential(*blocks_list)
+            self.encoder_layers.add_module(stage_name, stage_layers)
+        # ``in_channels`` tracks the output of the last block built. Unlike
+        # the inner loop variable, it is always bound, even when
+        # ``encoder_channels`` contains no channels.
+        return in_channels
+
+    def make_decoder_layers(self, make_block, norm_cfg, in_channels):
+        """Make decoder layers using sparse convs.
+
+        For each entry of ``self.decoder_channels`` three modules are set as
+        attributes: ``lateral_layer{k}``, ``merge_layer{k}`` and
+        ``upsample_layer{k}``, where ``k`` counts down from the number of
+        blocks to 1. ``forward`` looks them up by these names.
+
+        Args:
+            make_block (method): a bound method to build blocks
+            norm_cfg (dict[str]): config of normalization layer
+            in_channels (int): the number of encoder output channels, i.e.
+                the input channels of the first decoder block
+
+        Returns:
+            None: the layers are registered on ``self``; nothing is returned.
+        """
+        block_num = len(self.decoder_channels)
+        for i, block_channels in enumerate(self.decoder_channels):
+            paddings = self.decoder_paddings[i]
+            setattr(
+                self, f'lateral_layer{block_num - i}',
+                SparseBasicBlock(
+                    in_channels,
+                    block_channels[0],
+                    conv_cfg=dict(
+                        type='SubMConv3d', indice_key=f'subm{block_num - i}'),
+                    norm_cfg=norm_cfg))
+            # NOTE(review): decoder_layer_forward concatenates the bottom
+            # features with the lateral output, so ``in_channels * 2`` matches
+            # that input only when block_channels[0] == in_channels (true for
+            # the default config) - confirm before using other configs.
+            setattr(
+                self, f'merge_layer{block_num - i}',
+                make_block(
+                    in_channels * 2,
+                    block_channels[1],
+                    3,
+                    norm_cfg=norm_cfg,
+                    padding=paddings[0],
+                    indice_key=f'subm{block_num - i}'))
+            # NOTE(review): the upsample layer receives features reduced to
+            # block_channels[1] channels, yet is built with ``in_channels``
+            # inputs; the default config keeps the two equal.
+            if block_num - i != 1:
+                # Same key string as the strided conv of encoder stage k
+                # (see make_encoder_layers); presumably this lets the inverse
+                # conv reuse that layer's indices - TODO confirm.
+                setattr(
+                    self, f'upsample_layer{block_num - i}',
+                    make_block(
+                        in_channels,
+                        block_channels[2],
+                        3,
+                        norm_cfg=norm_cfg,
+                        padding=paddings[1],
+                        indice_key=f'spconv{block_num - i}',
+                        conv_type='inverseconv'))
+            else:
+                # use submanifold conv instead of inverse conv
+                # in the last block
+                setattr(
+                    self, f'upsample_layer{block_num - i}',
+                    make_block(
+                        in_channels,
+                        block_channels[2],
+                        3,
+                        norm_cfg=norm_cfg,
+                        padding=paddings[1],
+                        indice_key='subm1',
+                        conv_type='subm'))
+            # The next block consumes the upsampled output.
+            in_channels = block_channels[2]
--- a/mmdet3d/ops/__init__.py
+++ b/mmdet3d/ops/__init__.py
@@ -2,12 +2,29 @@ from mmdet.ops import (RoIAlign, SigmoidFocalLoss, get_compiler_version,
                       get_compiling_cuda_version, nms, roi_align,
                       sigmoid_focal_loss)
 from .norm import NaiveSyncBatchNorm1d, NaiveSyncBatchNorm2d
+from .sparse_block import (SparseBasicBlock, SparseBasicBlockV0,
+                           SparseBottleneck, SparseBottleneckV0)
 from .voxel import DynamicScatter, Voxelization, dynamic_scatter, voxelization

 __all__ = [
-    'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'get_compiler_version',
-    'get_compiling_cuda_version', 'build_conv_layer', 'NaiveSyncBatchNorm1d',
-    'NaiveSyncBatchNorm2d', 'batched_nms', 'Voxelization', 'voxelization',
-    'dynamic_scatter', 'DynamicScatter', 'sigmoid_focal_loss',
-    'SigmoidFocalLoss'
+    'nms',
+    'soft_nms',
+    'RoIAlign',
+    'roi_align',
+    'get_compiler_version',
+    'get_compiling_cuda_version',
+    'build_conv_layer',
+    'NaiveSyncBatchNorm1d',
+    'NaiveSyncBatchNorm2d',
+    'batched_nms',
+    'Voxelization',
+    'voxelization',
+    'dynamic_scatter',
+    'DynamicScatter',
+    'sigmoid_focal_loss',
+    'SigmoidFocalLoss',
+    'SparseBasicBlockV0',
+    'SparseBottleneckV0',
+    'SparseBasicBlock',
+    'SparseBottleneck',
 ]
--- a/mmdet3d/ops/sparse_block.py
+++ b/mmdet3d/ops/sparse_block.py
+from torch import nn
+
+import mmdet3d.ops.spconv as spconv
+from mmdet.models.backbones.resnet import BasicBlock, Bottleneck
+from mmdet.ops import build_norm_layer
+from mmdet.ops.conv import conv_cfg
+
+# Add spconv's SubMConv3d to mmdet's conv_cfg mapping under the key
+# 'SubMConv3d'; callers select it with conv_cfg=dict(type='SubMConv3d', ...).
+# This runs at import time and modifies mmdet's module-level dict.
+conv_cfg.update({'SubMConv3d': spconv.SubMConv3d})
+
+
+def conv3x3(in_planes, out_planes, stride=1, indice_key=None):
+    """3x3 submanifold sparse convolution with padding.
+
+    Thin wrapper that builds a bias-free ``spconv.SubMConv3d`` with kernel
+    size 3 and padding 1.
+
+    Args:
+        in_planes (int): the number of input channels
+        out_planes (int): the number of output channels
+        stride (int): the stride of convolution
+        indice_key (str): the indice key used for sparse tensor
+
+    Returns:
+        spconv.conv.SubMConv3d: 3x3 submanifold sparse convolution ops
+    """
+    # TODO: deprecate this function
+    return spconv.SubMConv3d(
+        in_planes,
+        out_planes,
+        kernel_size=3,
+        stride=stride,
+        padding=1,
+        bias=False,
+        indice_key=indice_key)
+
+
+def conv1x1(in_planes, out_planes, stride=1, indice_key=None):
+    """1x1 submanifold sparse convolution with padding.
+
+    Thin wrapper that builds a bias-free ``spconv.SubMConv3d`` with kernel
+    size 1.
+
+    Args:
+        in_planes (int): the number of input channels
+        out_planes (int): the number of output channels
+        stride (int): the stride of convolution
+        indice_key (str): the indice key used for sparse tensor
+
+    Returns:
+        spconv.conv.SubMConv3d: 1x1 submanifold sparse convolution ops
+    """
+    # TODO: deprecate this function
+    # NOTE(review): padding=1 is passed even though the kernel size is 1,
+    # mirroring conv3x3 - confirm this is intended for a 1x1 kernel.
+    return spconv.SubMConv3d(
+        in_planes,
+        out_planes,
+        kernel_size=1,
+        stride=stride,
+        padding=1,
+        bias=False,
+        indice_key=indice_key)
+
+
+class SparseBasicBlockV0(spconv.SparseModule):
+    # Output channels equal ``planes * expansion``.
+    expansion = 1
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 indice_key=None,
+                 norm_cfg=None):
+        """Sparse basic block for PartA^2.
+
+        Sparse basic block implemented with submanifold sparse convolution:
+        two 3x3 convs, each followed by a norm layer, with a residual add.
+
+        Args:
+            inplanes (int): the number of input channels
+            planes (int): the number of channels of both convolutions
+            stride (int): the stride of the first convolution
+            downsample (callable, optional): applied to the input to produce
+                the residual branch; the raw input features are used if None
+            indice_key (str): the indice key shared by both convolutions
+            norm_cfg (dict): config of normalization layer
+        """
+        # TODO: deprecate this class
+        super().__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride, indice_key=indice_key)
+        # build_norm_layer returns (name, layer); only the layer is kept.
+        norm_name1, norm_layer1 = build_norm_layer(norm_cfg, planes)
+        self.bn1 = norm_layer1
+        self.relu = nn.ReLU()
+        self.conv2 = conv3x3(planes, planes, indice_key=indice_key)
+        norm_name2, norm_layer2 = build_norm_layer(norm_cfg, planes)
+        self.bn2 = norm_layer2
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        """Apply the residual block to a sparse tensor.
+
+        Norm and ReLU are applied to the ``features`` matrix of the sparse
+        tensor, not to the sparse tensor itself.
+        """
+        identity = x.features
+
+        assert x.features.dim() == 2, f'x.features.dim()={x.features.dim()}'
+
+        out = self.conv1(x)
+        out.features = self.bn1(out.features)
+        out.features = self.relu(out.features)
+
+        out = self.conv2(out)
+        out.features = self.bn2(out.features)
+
+        if self.downsample is not None:
+            # NOTE(review): the result is added to a feature matrix below, so
+            # downsample presumably has to return features - TODO confirm.
+            identity = self.downsample(x)
+
+        out.features += identity
+        out.features = self.relu(out.features)
+
+        return out
+
+
+class SparseBottleneckV0(spconv.SparseModule):
+    # The last convolution outputs ``planes * expansion`` channels.
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 indice_key=None,
+                 norm_fn=None):
+        """Sparse bottleneck block for PartA^2.
+
+        Bottleneck block implemented with submanifold sparse convolution:
+        1x1 -> 3x3 -> 1x1 convs, each followed by a norm layer, with a
+        residual add.
+
+        Args:
+            inplanes (int): the number of input channels
+            planes (int): the number of channels of the first two convs
+            stride (int): the stride of the 3x3 convolution
+            downsample (callable, optional): applied to the input to produce
+                the residual branch; the raw input features are used if None
+            indice_key (str): the indice key shared by all convolutions
+            norm_fn (callable): called with a channel count to build each
+                norm layer. Unlike SparseBasicBlockV0 this is a callable,
+                not a config dict, and the default of None is not callable.
+        """
+        # TODO: deprecate this class
+        super().__init__()
+        self.conv1 = conv1x1(inplanes, planes, indice_key=indice_key)
+        self.bn1 = norm_fn(planes)
+        self.conv2 = conv3x3(planes, planes, stride, indice_key=indice_key)
+        self.bn2 = norm_fn(planes)
+        self.conv3 = conv1x1(
+            planes, planes * self.expansion, indice_key=indice_key)
+        self.bn3 = norm_fn(planes * self.expansion)
+        self.relu = nn.ReLU()
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        """Apply the bottleneck block to a sparse tensor.
+
+        Norm and ReLU are applied to the ``features`` matrix of the sparse
+        tensor, not to the sparse tensor itself.
+        """
+        identity = x.features
+
+        out = self.conv1(x)
+        out.features = self.bn1(out.features)
+        out.features = self.relu(out.features)
+
+        out = self.conv2(out)
+        out.features = self.bn2(out.features)
+        out.features = self.relu(out.features)
+
+        out = self.conv3(out)
+        out.features = self.bn3(out.features)
+
+        if self.downsample is not None:
+            # NOTE(review): the result is added to a feature matrix below, so
+            # downsample presumably has to return features - TODO confirm.
+            identity = self.downsample(x)
+
+        out.features += identity
+        out.features = self.relu(out.features)
+
+        return out
+
+
+class SparseBottleneck(Bottleneck, spconv.SparseModule):
+    # The last convolution outputs ``planes * expansion`` channels.
+    expansion = 4
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 conv_cfg=None,
+                 norm_cfg=None):
+        """Sparse bottleneck block for PartA^2.
+
+        Bottleneck block implemented with submanifold sparse convolution.
+        The layers themselves are built by mmdet's ``Bottleneck``; this
+        class only overrides ``forward`` to work on sparse tensors.
+
+        Args:
+            inplanes (int): the number of input channels
+            planes (int): the base number of channels of the block
+            stride (int): the stride passed to ``Bottleneck``
+            downsample (callable, optional): applied to the input to produce
+                the residual branch; the raw input features are used if None
+            conv_cfg (dict): config of the convolution layers
+            norm_cfg (dict): config of normalization layer
+        """
+        spconv.SparseModule.__init__(self)
+        Bottleneck.__init__(
+            self,
+            inplanes,
+            planes,
+            stride=stride,
+            downsample=downsample,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg)
+
+    def forward(self, x):
+        """Apply the bottleneck block to a sparse tensor.
+
+        Norm and ReLU are applied to the ``features`` matrix of the sparse
+        tensor, not to the sparse tensor itself.
+        """
+        identity = x.features
+
+        # The norm layers are reached through the parent's norm1/norm2/norm3
+        # accessors, as SparseBasicBlock does, instead of hard-coded
+        # ``bn*`` attribute names that only exist for BN-type norm configs.
+        out = self.conv1(x)
+        out.features = self.norm1(out.features)
+        out.features = self.relu(out.features)
+
+        out = self.conv2(out)
+        out.features = self.norm2(out.features)
+        out.features = self.relu(out.features)
+
+        out = self.conv3(out)
+        out.features = self.norm3(out.features)
+
+        if self.downsample is not None:
+            # NOTE(review): the result is added to a feature matrix below, so
+            # downsample presumably has to return features - TODO confirm.
+            identity = self.downsample(x)
+
+        out.features += identity
+        out.features = self.relu(out.features)
+
+        return out
+
+
+class SparseBasicBlock(BasicBlock, spconv.SparseModule):
+    # Output channels equal ``planes * expansion``.
+    expansion = 1
+
+    def __init__(self,
+                 inplanes,
+                 planes,
+                 stride=1,
+                 downsample=None,
+                 conv_cfg=None,
+                 norm_cfg=None):
+        """Sparse basic block for PartA^2.
+
+        Sparse basic block implemented with submanifold sparse convolution.
+        The layers themselves are built by mmdet's ``BasicBlock``; this
+        class only overrides ``forward`` to work on sparse tensors.
+
+        Args:
+            inplanes (int): the number of input channels
+            planes (int): the number of output channels
+            stride (int): the stride passed to ``BasicBlock``
+            downsample (callable, optional): applied to the input to produce
+                the residual branch; the raw input features are used if None
+            conv_cfg (dict): config of the convolution layers, e.g.
+                ``dict(type='SubMConv3d', indice_key='subm1')``
+            norm_cfg (dict): config of normalization layer
+        """
+        spconv.SparseModule.__init__(self)
+        BasicBlock.__init__(
+            self,
+            inplanes,
+            planes,
+            stride=stride,
+            downsample=downsample,
+            conv_cfg=conv_cfg,
+            norm_cfg=norm_cfg)
+
+    def forward(self, x):
+        """Apply the residual block to a sparse tensor.
+
+        Norm and ReLU are applied to the ``features`` matrix of the sparse
+        tensor, not to the sparse tensor itself.
+        """
+        identity = x.features
+
+        assert x.features.dim() == 2, f'x.features.dim()={x.features.dim()}'
+
+        out = self.conv1(x)
+        out.features = self.norm1(out.features)
+        out.features = self.relu(out.features)
+
+        out = self.conv2(out)
+        out.features = self.norm2(out.features)
+
+        if self.downsample is not None:
+            # NOTE(review): the result is added to a feature matrix below, so
+            # downsample presumably has to return features - TODO confirm.
+            identity = self.downsample(x)
+
+        out.features += identity
+        out.features = self.relu(out.features)
+
+        return out
--- a/tests/test_roiaware_pool3d.py
+++ b/tests/test_roiaware_pool3d.py
@@ -19,23 +19,10 @@ def test_RoIAwarePool3d():
        dtype=torch.float32).cuda(
        )  # boxes (m, 7) with bottom center in lidar coordinate
    pts = torch.tensor(
-        [
-            [1, 2, 3.3],
-            [1.2, 2.5, 3.0],
-            [0.8, 2.1, 3.5],
-            [1.6, 2.6, 3.6],
-            [0.8, 1.2, 3.9],
-            [-9.2, 21.0, 18.2],
-            [3.8, 7.9, 6.3],
-            [4.7, 3.5, -12.2],
-            [3.8, 7.6, -2],
-            [-10.6, -12.9, -20],
-            [-16, -18, 9],
-            [-21.3, -52, -5],
-            [0, 0, 0],
-            [6, 7, 8],
-            [-2, -3, -4],
-        ],
+        [[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
+         [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
+         [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9],
+         [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]],
        dtype=torch.float32).cuda()  # points (n, 3) in lidar coordinate
    pts_feature = pts.clone()

@@ -83,23 +70,10 @@ def test_points_in_boxes_cpu():
        dtype=torch.float32
    )  # boxes (m, 7) with bottom center in lidar coordinate
    pts = torch.tensor(
-        [
-            [1, 2, 3.3],
-            [1.2, 2.5, 3.0],
-            [0.8, 2.1, 3.5],
-            [1.6, 2.6, 3.6],
-            [0.8, 1.2, 3.9],
-            [-9.2, 21.0, 18.2],
-            [3.8, 7.9, 6.3],
-            [4.7, 3.5, -12.2],
-            [3.8, 7.6, -2],
-            [-10.6, -12.9, -20],
-            [-16, -18, 9],
-            [-21.3, -52, -5],
-            [0, 0, 0],
-            [6, 7, 8],
-            [-2, -3, -4],
-        ],
+        [[1, 2, 3.3], [1.2, 2.5, 3.0], [0.8, 2.1, 3.5], [1.6, 2.6, 3.6],
+         [0.8, 1.2, 3.9], [-9.2, 21.0, 18.2], [3.8, 7.9, 6.3],
+         [4.7, 3.5, -12.2], [3.8, 7.6, -2], [-10.6, -12.9, -20], [-16, -18, 9],
+         [-21.3, -52, -5], [0, 0, 0], [6, 7, 8], [-2, -3, -4]],
        dtype=torch.float32)  # points (n, 3) in lidar coordinate

    point_indices = points_in_boxes_cpu(points=pts, boxes=boxes)
@@ -109,9 +83,3 @@ def test_points_in_boxes_cpu():
        dtype=torch.int32)
    assert point_indices.shape == torch.Size([2, 15])
    assert (point_indices == expected_point_indices).all()
-
-
-if __name__ == '__main__':
-    test_points_in_boxes_cpu()
-    test_points_in_boxes_gpu()
-    test_RoIAwarePool3d()
--- a/tests/test_sparse_unet.py
+++ b/tests/test_sparse_unet.py
+import torch
+
+import mmdet3d.ops.spconv as spconv
+from mmdet3d.ops import SparseBasicBlock, SparseBasicBlockV0
+
+
+def test_SparseUNet():
+    """Check the layer layout of SparseUNet and the shapes of its outputs."""
+    from mmdet3d.models.middle_encoders.sparse_unet import SparseUNet
+    self = SparseUNet(
+        in_channels=4, output_shape=[41, 1600, 1408], pre_act=False)
+
+    # test encoder layers
+    assert len(self.encoder_layers) == 4
+    assert self.encoder_layers.encoder_layer1[0][0].in_channels == 16
+    assert self.encoder_layers.encoder_layer1[0][0].out_channels == 16
+    assert isinstance(self.encoder_layers.encoder_layer1[0][0],
+                      spconv.conv.SubMConv3d)
+    assert isinstance(self.encoder_layers.encoder_layer1[0][1],
+                      torch.nn.modules.batchnorm.BatchNorm1d)
+    assert isinstance(self.encoder_layers.encoder_layer1[0][2],
+                      torch.nn.modules.activation.ReLU)
+    assert self.encoder_layers.encoder_layer4[0][0].in_channels == 64
+    assert self.encoder_layers.encoder_layer4[0][0].out_channels == 64
+    # Stages after the first start with a strided SparseConv3d.
+    assert isinstance(self.encoder_layers.encoder_layer4[0][0],
+                      spconv.conv.SparseConv3d)
+    assert isinstance(self.encoder_layers.encoder_layer4[2][0],
+                      spconv.conv.SubMConv3d)
+
+    # test decoder layers
+    assert isinstance(self.lateral_layer1, SparseBasicBlock)
+    assert isinstance(self.merge_layer1[0], spconv.conv.SubMConv3d)
+    # Only the last decoder block (index 1) upsamples with a submanifold conv.
+    assert isinstance(self.upsample_layer1[0], spconv.conv.SubMConv3d)
+    assert isinstance(self.upsample_layer2[0], spconv.conv.SparseInverseConv3d)
+
+    voxel_features = torch.tensor([[6.56126, 0.9648336, -1.7339306, 0.315],
+                                   [6.8162713, -2.480431, -1.3616394, 0.36],
+                                   [11.643568, -4.744306, -1.3580885, 0.16],
+                                   [23.482342, 6.5036807, 0.5806964, 0.35]],
+                                  dtype=torch.float32)  # n, point_features
+    coordinates = torch.tensor(
+        [[0, 12, 819, 131], [0, 16, 750, 136], [1, 16, 705, 232],
+         [1, 35, 930, 469]],
+        dtype=torch.int32)  # n, 4(batch_idx, z_idx, y_idx, x_idx)
+
+    unet_ret_dict = self.forward(voxel_features, coordinates, 2)
+    seg_features = unet_ret_dict['seg_features']
+    spatial_features = unet_ret_dict['spatial_features']
+
+    # One 16-channel feature row per input voxel.
+    assert seg_features.shape == torch.Size([4, 16])
+    assert spatial_features.shape == torch.Size([2, 256, 200, 176])
+
+
+def test_SparseBasicBlock():
+    """Check SparseBasicBlockV0 and SparseBasicBlock on a tiny sparse input."""
+    voxel_features = torch.tensor([[6.56126, 0.9648336, -1.7339306, 0.315],
+                                   [6.8162713, -2.480431, -1.3616394, 0.36],
+                                   [11.643568, -4.744306, -1.3580885, 0.16],
+                                   [23.482342, 6.5036807, 0.5806964, 0.35]],
+                                  dtype=torch.float32)  # n, point_features
+    coordinates = torch.tensor(
+        [[0, 12, 819, 131], [0, 16, 750, 136], [1, 16, 705, 232],
+         [1, 35, 930, 469]],
+        dtype=torch.int32)  # n, 4(batch_idx, z_idx, y_idx, x_idx)
+
+    # test v0
+    self = SparseBasicBlockV0(
+        4,
+        4,
+        indice_key='subm0',
+        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01))
+    input_sp_tensor = spconv.SparseConvTensor(voxel_features, coordinates,
+                                              [41, 1600, 1408], 2)
+    out_features = self(input_sp_tensor)
+    assert out_features.features.shape == torch.Size([4, 4])
+
+    # test
+    # A fresh sparse tensor is built so the second block does not start
+    # from the tensor already passed through the V0 block.
+    input_sp_tensor = spconv.SparseConvTensor(voxel_features, coordinates,
+                                              [41, 1600, 1408], 2)
+    self = SparseBasicBlock(
+        4,
+        4,
+        conv_cfg=dict(type='SubMConv3d', indice_key='subm1'),
+        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01))
+    # test conv and bn layer
+    assert isinstance(self.conv1, spconv.conv.SubMConv3d)
+    assert self.conv1.in_channels == 4
+    assert self.conv1.out_channels == 4
+    assert isinstance(self.conv2, spconv.conv.SubMConv3d)
+    # Check both ends of conv2; out_channels used to be asserted twice.
+    assert self.conv2.in_channels == 4
+    assert self.conv2.out_channels == 4
+    assert self.bn1.eps == 1e-3
+    assert self.bn1.momentum == 0.01
+
+    out_features = self(input_sp_tensor)
+    assert out_features.features.shape == torch.Size([4, 4])