Unverified Commit 27d0001e authored by yinchimaoliang's avatar yinchimaoliang Committed by GitHub
Browse files

Add modules before mg_head in centerpoint (#46)

* Add centerpoint_rpn and scn, change pillar encoder and voxel_encoder

* Move test_voxel_encoders.

* Change names, add docstring.

* Reconstruct centerpoint_rpn.

* Add centerpoint_rpn.

* Change SECONDFPN, delete centerpoint_fpn

* Remove SparseBasicBlock.

* Change SpMiddleResNetFHD to SparseEncoder.

* Finish SparseEncoder unittest.

* Change test_hard_simple_VFE.

* Change option, add legacy.

* Change docstring, change legacy.

* Fix legacy bug.

* Change unittest, change docstring.

* Change docstring.
parent 5f7b31cc
from torch import nn as nn from torch import nn as nn
from mmdet3d.ops import make_sparse_convmodule from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule
from mmdet3d.ops import spconv as spconv from mmdet3d.ops import spconv as spconv
from ..registry import MIDDLE_ENCODERS from ..registry import MIDDLE_ENCODERS
...@@ -12,12 +12,19 @@ class SparseEncoder(nn.Module): ...@@ -12,12 +12,19 @@ class SparseEncoder(nn.Module):
Args: Args:
in_channels (int): The number of input channels. in_channels (int): The number of input channels.
sparse_shape (list[int]): The sparse shape of input tensor. sparse_shape (list[int]): The sparse shape of input tensor.
norm_cfg (dict): Config of normalization layer. order (list[str]): Order of conv module. Defaults to ('conv',
'norm', 'act').
norm_cfg (dict): Config of normalization layer. Defaults to
dict(type='BN1d', eps=1e-3, momentum=0.01).
base_channels (int): Out channels for conv_input layer. base_channels (int): Out channels for conv_input layer.
Defaults to 16.
output_channels (int): Out channels for conv_out layer. output_channels (int): Out channels for conv_out layer.
Defaults to 128.
encoder_channels (tuple[tuple[int]]): encoder_channels (tuple[tuple[int]]):
Convolutional channels of each encode block. Convolutional channels of each encode block.
encoder_paddings (tuple[tuple[int]]): Paddings of each encode block. encoder_paddings (tuple[tuple[int]]): Paddings of each encode block.
Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)).
block_type (str): Type of the block to use. Defaults to 'conv_module'.
""" """
def __init__(self, def __init__(self,
...@@ -30,8 +37,10 @@ class SparseEncoder(nn.Module): ...@@ -30,8 +37,10 @@ class SparseEncoder(nn.Module):
encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64,
64)), 64)),
encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1,
1))): 1)),
block_type='conv_module'):
super().__init__() super().__init__()
assert block_type in ['conv_module', 'basicblock']
self.sparse_shape = sparse_shape self.sparse_shape = sparse_shape
self.in_channels = in_channels self.in_channels = in_channels
self.order = order self.order = order
...@@ -66,7 +75,10 @@ class SparseEncoder(nn.Module): ...@@ -66,7 +75,10 @@ class SparseEncoder(nn.Module):
conv_type='SubMConv3d') conv_type='SubMConv3d')
encoder_out_channels = self.make_encoder_layers( encoder_out_channels = self.make_encoder_layers(
make_sparse_convmodule, norm_cfg, self.base_channels) make_sparse_convmodule,
norm_cfg,
self.base_channels,
block_type=block_type)
self.conv_out = make_sparse_convmodule( self.conv_out = make_sparse_convmodule(
encoder_out_channels, encoder_out_channels,
...@@ -111,17 +123,27 @@ class SparseEncoder(nn.Module): ...@@ -111,17 +123,27 @@ class SparseEncoder(nn.Module):
return spatial_features return spatial_features
def make_encoder_layers(self, make_block, norm_cfg, in_channels): def make_encoder_layers(self,
make_block,
norm_cfg,
in_channels,
block_type='conv_module',
conv_cfg=dict(type='SubMConv3d')):
"""make encoder layers using sparse convs. """make encoder layers using sparse convs.
Args: Args:
make_block (method): A bounded function to build blocks. make_block (method): A bounded function to build blocks.
norm_cfg (dict[str]): Config of normalization layer. norm_cfg (dict[str]): Config of normalization layer.
in_channels (int): The number of encoder input channels. in_channels (int): The number of encoder input channels.
block_type (str): Type of the block to use. Defaults to
'conv_module'.
conv_cfg (dict): Config of conv layer. Defaults to
dict(type='SubMConv3d').
Returns: Returns:
int: The number of encoder output channels. int: The number of encoder output channels.
""" """
assert block_type in ['conv_module', 'basicblock']
self.encoder_layers = spconv.SparseSequential() self.encoder_layers = spconv.SparseSequential()
for i, blocks in enumerate(self.encoder_channels): for i, blocks in enumerate(self.encoder_channels):
...@@ -130,7 +152,7 @@ class SparseEncoder(nn.Module): ...@@ -130,7 +152,7 @@ class SparseEncoder(nn.Module):
padding = tuple(self.encoder_paddings[i])[j] padding = tuple(self.encoder_paddings[i])[j]
# each stage started with a spconv layer # each stage started with a spconv layer
# except the first stage # except the first stage
if i != 0 and j == 0: if i != 0 and j == 0 and block_type == 'conv_module':
blocks_list.append( blocks_list.append(
make_block( make_block(
in_channels, in_channels,
...@@ -141,6 +163,26 @@ class SparseEncoder(nn.Module): ...@@ -141,6 +163,26 @@ class SparseEncoder(nn.Module):
padding=padding, padding=padding,
indice_key=f'spconv{i + 1}', indice_key=f'spconv{i + 1}',
conv_type='SparseConv3d')) conv_type='SparseConv3d'))
elif block_type == 'basicblock':
if j == len(blocks) - 1 and i != len(
self.encoder_channels) - 1:
blocks_list.append(
make_block(
in_channels,
out_channels,
3,
norm_cfg=norm_cfg,
stride=2,
padding=padding,
indice_key=f'spconv{i + 1}',
conv_type='SparseConv3d'))
else:
blocks_list.append(
SparseBasicBlock(
out_channels,
out_channels,
norm_cfg=norm_cfg,
conv_cfg=conv_cfg))
else: else:
blocks_list.append( blocks_list.append(
make_block( make_block(
......
import numpy as np
import torch import torch
from mmcv.cnn import (build_norm_layer, build_upsample_layer, constant_init, from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer,
is_norm, kaiming_init) constant_init, is_norm, kaiming_init)
from torch import nn as nn from torch import nn as nn
from mmdet.models import NECKS from mmdet.models import NECKS
...@@ -11,11 +12,14 @@ class SECONDFPN(nn.Module): ...@@ -11,11 +12,14 @@ class SECONDFPN(nn.Module):
"""FPN used in SECOND/PointPillars/PartA2/MVXNet. """FPN used in SECOND/PointPillars/PartA2/MVXNet.
Args: Args:
in_channels (list[int]): Input channels of multi-scale feature maps in_channels (list[int]): Input channels of multi-scale feature maps.
out_channels (list[int]): Output channels of feature maps out_channels (list[int]): Output channels of feature maps.
upsample_strides (list[int]): Strides used to upsample the feature maps upsample_strides (list[int]): Strides used to upsample the
norm_cfg (dict): Config dict of normalization layers feature maps.
upsample_cfg (dict): Config dict of upsample layers norm_cfg (dict): Config dict of normalization layers.
upsample_cfg (dict): Config dict of upsample layers.
conv_cfg (dict): Config dict of conv layers.
use_conv_for_no_stride (bool): Whether to use conv when stride is 1.
""" """
def __init__(self, def __init__(self,
...@@ -23,7 +27,9 @@ class SECONDFPN(nn.Module): ...@@ -23,7 +27,9 @@ class SECONDFPN(nn.Module):
out_channels=[256, 256, 256], out_channels=[256, 256, 256],
upsample_strides=[1, 2, 4], upsample_strides=[1, 2, 4],
norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False)): upsample_cfg=dict(type='deconv', bias=False),
conv_cfg=dict(type='Conv2d', bias=False),
use_conv_for_no_stride=False):
# if for GroupNorm, # if for GroupNorm,
# cfg is dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True) # cfg is dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True)
super(SECONDFPN, self).__init__() super(SECONDFPN, self).__init__()
...@@ -33,12 +39,23 @@ class SECONDFPN(nn.Module): ...@@ -33,12 +39,23 @@ class SECONDFPN(nn.Module):
deblocks = [] deblocks = []
for i, out_channel in enumerate(out_channels): for i, out_channel in enumerate(out_channels):
stride = upsample_strides[i]
if stride > 1 or (stride == 1 and not use_conv_for_no_stride):
upsample_layer = build_upsample_layer( upsample_layer = build_upsample_layer(
upsample_cfg, upsample_cfg,
in_channels=in_channels[i], in_channels=in_channels[i],
out_channels=out_channel, out_channels=out_channel,
kernel_size=upsample_strides[i], kernel_size=upsample_strides[i],
stride=upsample_strides[i]) stride=upsample_strides[i])
else:
stride = np.round(1 / stride).astype(np.int64)
upsample_layer = build_conv_layer(
conv_cfg,
in_channels=in_channels[i],
out_channels=out_channel,
kernel_size=stride,
stride=stride)
deblock = nn.Sequential(upsample_layer, deblock = nn.Sequential(upsample_layer,
build_norm_layer(norm_cfg, out_channel)[1], build_norm_layer(norm_cfg, out_channel)[1],
nn.ReLU(inplace=True)) nn.ReLU(inplace=True))
......
...@@ -31,6 +31,8 @@ class PillarFeatureNet(nn.Module): ...@@ -31,6 +31,8 @@ class PillarFeatureNet(nn.Module):
Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01).
mode (str, optional): The mode to gather point features. Options are mode (str, optional): The mode to gather point features. Options are
'max' or 'avg'. Defaults to 'max'. 'max' or 'avg'. Defaults to 'max'.
legacy (bool): Whether to use the new behavior or
the original behavior. Defaults to True.
""" """
def __init__(self, def __init__(self,
...@@ -42,9 +44,11 @@ class PillarFeatureNet(nn.Module): ...@@ -42,9 +44,11 @@ class PillarFeatureNet(nn.Module):
voxel_size=(0.2, 0.2, 4), voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1), point_cloud_range=(0, -40, -3, 70.4, 40, 1),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
mode='max'): mode='max',
legacy=True):
super(PillarFeatureNet, self).__init__() super(PillarFeatureNet, self).__init__()
assert len(feat_channels) > 0 assert len(feat_channels) > 0
self.legacy = legacy
if with_cluster_center: if with_cluster_center:
in_channels += 3 in_channels += 3
if with_voxel_center: if with_voxel_center:
...@@ -89,7 +93,7 @@ class PillarFeatureNet(nn.Module): ...@@ -89,7 +93,7 @@ class PillarFeatureNet(nn.Module):
features (torch.Tensor): Point features or raw points in shape features (torch.Tensor): Point features or raw points in shape
(N, M, C). (N, M, C).
num_points (torch.Tensor): Number of points in each pillar. num_points (torch.Tensor): Number of points in each pillar.
coors (torch.Tensor): Coordinates of each voxel coors (torch.Tensor): Coordinates of each voxel.
Returns: Returns:
torch.Tensor: Features of pillars. torch.Tensor: Features of pillars.
...@@ -104,7 +108,17 @@ class PillarFeatureNet(nn.Module): ...@@ -104,7 +108,17 @@ class PillarFeatureNet(nn.Module):
features_ls.append(f_cluster) features_ls.append(f_cluster)
# Find distance of x, y, and z from pillar center # Find distance of x, y, and z from pillar center
dtype = features.dtype
if self._with_voxel_center: if self._with_voxel_center:
if not self.legacy:
f_center = torch.zeros_like(features[:, :, :2])
f_center[:, :, 0] = features[:, :, 0] - (
coors[:, 3].to(dtype).unsqueeze(1) * self.vx +
self.x_offset)
f_center[:, :, 1] = features[:, :, 1] - (
coors[:, 2].to(dtype).unsqueeze(1) * self.vy +
self.y_offset)
else:
f_center = features[:, :, :2] f_center = features[:, :, :2]
f_center[:, :, 0] = f_center[:, :, 0] - ( f_center[:, :, 0] = f_center[:, :, 0] - (
coors[:, 3].type_as(features).unsqueeze(1) * self.vx + coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
......
...@@ -13,10 +13,14 @@ class HardSimpleVFE(nn.Module): ...@@ -13,10 +13,14 @@ class HardSimpleVFE(nn.Module):
"""Simple voxel feature encoder used in SECOND. """Simple voxel feature encoder used in SECOND.
It simply averages the values of points in a voxel. It simply averages the values of points in a voxel.
Args:
num_features (int): Number of features to use. Default: 4.
""" """
def __init__(self): def __init__(self, num_features=4):
super(HardSimpleVFE, self).__init__() super(HardSimpleVFE, self).__init__()
self.num_features = num_features
def forward(self, features, num_points, coors): def forward(self, features, num_points, coors):
"""Forward function. """Forward function.
...@@ -32,7 +36,7 @@ class HardSimpleVFE(nn.Module): ...@@ -32,7 +36,7 @@ class HardSimpleVFE(nn.Module):
Returns: Returns:
torch.Tensor: Mean of points inside each voxel in shape (N, 3(4)) torch.Tensor: Mean of points inside each voxel in shape (N, 3(4))
""" """
points_mean = features[:, :, :4].sum( points_mean = features[:, :, :self.num_features].sum(
dim=1, keepdim=False) / num_points.type_as(features).view(-1, 1) dim=1, keepdim=False) / num_points.type_as(features).view(-1, 1)
return points_mean.contiguous() return points_mean.contiguous()
......
import pytest
import torch
from mmdet3d.models.builder import build_middle_encoder
def test_sparse_encoder():
    """Smoke-test SparseEncoder built with the basic-block style encoder.

    Builds the middle encoder from a config dict and checks the shape of
    the dense BEV feature map it produces. Skipped without CUDA because
    spconv sparse convolutions only run on GPU.
    """
    if not torch.cuda.is_available():
        pytest.skip('test requires GPU and torch+cuda')
    sparse_encoder_cfg = dict(
        type='SparseEncoder',
        in_channels=5,
        sparse_shape=[40, 1024, 1024],
        order=('conv', 'norm', 'act'),
        encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
                                                                      128)),
        # One padding tuple per encoder stage, with one entry per conv in
        # that stage (the last stage has only two convs).
        encoder_paddings=((1, 1, 1), (1, 1, 1), (1, 1, 1), (1, 1)),
        # SparseEncoder's keyword is ``block_type`` (asserted to be
        # 'conv_module' or 'basicblock'); it does not accept ``option``.
        block_type='basicblock')
    sparse_encoder = build_middle_encoder(sparse_encoder_cfg).cuda()
    voxel_features = torch.rand([207842, 5]).cuda()
    coors = torch.randint(0, 4, [207842, 4]).cuda()

    ret = sparse_encoder(voxel_features, coors, 4)
    assert ret.shape == torch.Size([4, 256, 128, 128])
import torch
from mmdet3d.models.builder import build_backbone, build_neck
def test_centerpoint_fpn():
    """Check SECONDFPN output shapes for CenterPoint vs. the original usage.

    The CenterPoint config uses a fractional upsample stride (0.5) together
    with ``use_conv_for_no_stride=True``, which makes SECONDFPN downsample
    with a strided conv instead of a deconv; the original config only
    upsamples. Both necks consume the same SECOND backbone features.
    """
    second_cfg = dict(
        type='SECOND',
        in_channels=64,
        out_channels=[64, 128, 256],
        layer_nums=[3, 5, 5],
        layer_strides=[2, 2, 2],
        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
        conv_cfg=dict(type='Conv2d', bias=False))
    second = build_backbone(second_cfg)

    # centerpoint usage of fpn
    centerpoint_fpn_cfg = dict(
        type='SECONDFPN',
        in_channels=[64, 128, 256],
        out_channels=[128, 128, 128],
        upsample_strides=[0.5, 1, 2],
        norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
        upsample_cfg=dict(type='deconv', bias=False),
        use_conv_for_no_stride=True)

    # original usage of fpn
    fpn_cfg = dict(
        type='SECONDFPN',
        in_channels=[64, 128, 256],
        upsample_strides=[1, 2, 4],
        out_channels=[128, 128, 128])

    second_fpn = build_neck(fpn_cfg)
    centerpoint_second_fpn = build_neck(centerpoint_fpn_cfg)

    # Renamed from ``input`` to avoid shadowing the builtin.
    x = torch.rand([4, 64, 512, 512])
    sec_output = second(x)
    centerpoint_output = centerpoint_second_fpn(sec_output)
    second_output = second_fpn(sec_output)
    # 3 scales x 128 channels are concatenated -> 384 channels; the
    # CenterPoint neck keeps the 128x128 resolution, the original neck
    # upsamples everything back to 256x256.
    assert centerpoint_output[0].shape == torch.Size([4, 384, 128, 128])
    assert second_output[0].shape == torch.Size([4, 384, 256, 256])
import torch
from mmdet3d.models.builder import build_voxel_encoder
def test_pillar_feature_net():
    """Build a PillarFeatureNet from config and verify its output shape."""
    cfg = dict(
        type='PillarFeatureNet',
        in_channels=5,
        feat_channels=[64],
        with_distance=False,
        voxel_size=(0.2, 0.2, 8),
        point_cloud_range=(-51.2, -51.2, -5.0, 51.2, 51.2, 3.0),
        norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01))
    pillar_feature_net = build_voxel_encoder(cfg)

    # Random pillars: 97297 pillars, up to 20 points each, 5 features/point.
    raw_features = torch.rand([97297, 20, 5])
    num_voxels = torch.randint(1, 100, [97297])
    coors = torch.randint(0, 100, [97297, 4])

    out = pillar_feature_net(raw_features, num_voxels, coors)
    assert out.shape == torch.Size([97297, 64])
def test_hard_simple_VFE():
    """Build a HardSimpleVFE with 5 features and verify its output shape."""
    vfe = build_voxel_encoder(dict(type='HardSimpleVFE', num_features=5))

    points = torch.rand([240000, 10, 5])
    points_per_voxel = torch.randint(1, 10, [240000])

    # coors is unused by HardSimpleVFE, so None is fine here.
    result = vfe(points, points_per_voxel, None)
    assert result.shape == torch.Size([240000, 5])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment