Commit 3b8d508a authored by lishj6

init_0905

parent e968ab0f
# Copyright (c) OpenMMLab. All rights reserved.
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import ConvModule
from mmcv.runner import BaseModule, auto_fp16
from mmdet3d.models.builder import NECKS
@NECKS.register_module()
class CustomFPN(BaseModule):
r"""Feature Pyramid Network.
This is an implementation of paper `Feature Pyramid Networks for Object
Detection <https://arxiv.org/abs/1612.03144>`_.
Args:
in_channels (List[int]): Number of input channels per scale.
out_channels (int): Number of output channels (used at each scale).
num_outs (int): Number of output scales.
start_level (int): Index of the start input backbone level used to
build the feature pyramid. Default: 0.
end_level (int): Index of the end input backbone level (exclusive) to
build the feature pyramid. Default: -1, which means the last level.
add_extra_convs (bool | str): If bool, it decides whether to add conv
layers on top of the original feature maps. Default: False.
If True, it is equivalent to `add_extra_convs='on_input'`.
If str, it specifies the source feature map of the extra convs.
Only the following options are allowed
- 'on_input': Last feat map of neck inputs (i.e. backbone feature).
- 'on_lateral': Last feature map after lateral convs.
- 'on_output': The last output feature map after fpn convs.
relu_before_extra_convs (bool): Whether to apply relu before the extra
conv. Default: False.
no_norm_on_lateral (bool): Whether to apply norm on lateral.
Default: False.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Config dict for normalization layer. Default: None.
act_cfg (dict): Config dict for activation layer in ConvModule.
Default: None.
upsample_cfg (dict): Config dict for interpolate layer.
Default: `dict(mode='nearest')`
init_cfg (dict or list[dict], optional): Initialization config dict.
Example:
>>> import torch
>>> in_channels = [2, 3, 5, 7]
>>> scales = [340, 170, 84, 43]
>>> inputs = [torch.rand(1, c, s, s)
... for c, s in zip(in_channels, scales)]
>>> self = CustomFPN(in_channels, 11, len(in_channels), out_ids=[0, 1, 2, 3]).eval()
>>> outputs = self.forward(inputs)
>>> for i in range(len(outputs)):
... print(f'outputs[{i}].shape = {outputs[i].shape}')
outputs[0].shape = torch.Size([1, 11, 340, 340])
outputs[1].shape = torch.Size([1, 11, 170, 170])
outputs[2].shape = torch.Size([1, 11, 84, 84])
outputs[3].shape = torch.Size([1, 11, 43, 43])
"""
def __init__(self,
in_channels,
out_channels,
num_outs,
start_level=0,
end_level=-1,
out_ids=[],
add_extra_convs=False,
relu_before_extra_convs=False,
no_norm_on_lateral=False,
conv_cfg=None,
norm_cfg=None,
act_cfg=None,
upsample_cfg=dict(mode='nearest'),
init_cfg=dict(
type='Xavier', layer='Conv2d', distribution='uniform')):
super(CustomFPN, self).__init__(init_cfg)
assert isinstance(in_channels, list)
self.in_channels = in_channels
self.out_channels = out_channels
self.num_ins = len(in_channels)
self.num_outs = num_outs
self.relu_before_extra_convs = relu_before_extra_convs
self.no_norm_on_lateral = no_norm_on_lateral
self.fp16_enabled = False
self.upsample_cfg = upsample_cfg.copy()
self.out_ids = out_ids
if end_level == -1:
self.backbone_end_level = self.num_ins
# assert num_outs >= self.num_ins - start_level
else:
# if end_level < inputs, no extra level is allowed
self.backbone_end_level = end_level
assert end_level <= len(in_channels)
assert num_outs == end_level - start_level
self.start_level = start_level
self.end_level = end_level
self.add_extra_convs = add_extra_convs
assert isinstance(add_extra_convs, (str, bool))
if isinstance(add_extra_convs, str):
# Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output'
assert add_extra_convs in ('on_input', 'on_lateral', 'on_output')
elif add_extra_convs: # True
self.add_extra_convs = 'on_input'
self.lateral_convs = nn.ModuleList()
self.fpn_convs = nn.ModuleList()
for i in range(self.start_level, self.backbone_end_level):
l_conv = ConvModule(
in_channels[i],
out_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
act_cfg=act_cfg,
inplace=False)
self.lateral_convs.append(l_conv)
if i in self.out_ids:
fpn_conv = ConvModule(
out_channels,
out_channels,
3,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
inplace=False)
self.fpn_convs.append(fpn_conv)
# add extra conv layers (e.g., RetinaNet)
extra_levels = num_outs - self.backbone_end_level + self.start_level
if self.add_extra_convs and extra_levels >= 1:
for i in range(extra_levels):
if i == 0 and self.add_extra_convs == 'on_input':
in_channels = self.in_channels[self.backbone_end_level - 1]
else:
in_channels = out_channels
extra_fpn_conv = ConvModule(
in_channels,
out_channels,
3,
stride=2,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
inplace=False)
self.fpn_convs.append(extra_fpn_conv)
@auto_fp16()
def forward(self, inputs):
"""Forward function."""
assert len(inputs) == len(self.in_channels)
# build laterals
laterals = [
lateral_conv(inputs[i + self.start_level])
for i, lateral_conv in enumerate(self.lateral_convs)
]
# build top-down path
used_backbone_levels = len(laterals)
for i in range(used_backbone_levels - 1, 0, -1):
# In some cases, fixing `scale factor` (e.g. 2) is preferred, but
# it cannot co-exist with `size` in `F.interpolate`.
if 'scale_factor' in self.upsample_cfg:
laterals[i - 1] += F.interpolate(laterals[i],
**self.upsample_cfg)
else:
prev_shape = laterals[i - 1].shape[2:]
laterals[i - 1] += F.interpolate(
laterals[i], size=prev_shape, **self.upsample_cfg)
# build outputs
# part 1: from original levels
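# NOTE: fpn_convs holds one conv per entry of out_ids, appended in ascending
# level order, so fpn_convs[i] below lines up with laterals[i] only when
# out_ids enumerates the first consecutive levels (e.g. out_ids=[0]).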
outs = [self.fpn_convs[i](laterals[i]) for i in self.out_ids]
# part 2: add extra levels
if self.num_outs > len(outs):
# use max pool to get more levels on top of outputs
# (e.g., Faster R-CNN, Mask R-CNN)
if not self.add_extra_convs:
for i in range(self.num_outs - used_backbone_levels):
outs.append(F.max_pool2d(outs[-1], 1, stride=2))
# add conv layers on top of original feature maps (RetinaNet)
else:
if self.add_extra_convs == 'on_input':
extra_source = inputs[self.backbone_end_level - 1]
elif self.add_extra_convs == 'on_lateral':
extra_source = laterals[-1]
elif self.add_extra_convs == 'on_output':
extra_source = outs[-1]
else:
raise NotImplementedError
outs.append(self.fpn_convs[used_backbone_levels](extra_source))
for i in range(used_backbone_levels + 1, self.num_outs):
if self.relu_before_extra_convs:
outs.append(self.fpn_convs[i](F.relu(outs[-1])))
else:
outs.append(self.fpn_convs[i](outs[-1]))
return outs
# Copyright (c) Phigent Robotics. All rights reserved.
import torch
import torch.nn as nn
from mmcv.cnn import build_norm_layer
from torch.utils.checkpoint import checkpoint
from mmcv.cnn.bricks import ConvModule
from mmdet.models import NECKS
@NECKS.register_module()
class FPN_LSS(nn.Module):
def __init__(self,
in_channels,
out_channels,
scale_factor=4,
input_feature_index=(0, 2),
norm_cfg=dict(type='BN'),
extra_upsample=2,
lateral=None,
use_input_conv=False):
super(FPN_LSS, self).__init__()
self.input_feature_index = input_feature_index
self.extra_upsample = extra_upsample is not None
self.out_channels = out_channels
# upsample the high-level feature map
self.up = nn.Upsample(
scale_factor=scale_factor, mode='bilinear', align_corners=True)
channels_factor = 2 if self.extra_upsample else 1
self.conv = nn.Sequential(
nn.Conv2d(in_channels, out_channels * channels_factor, kernel_size=3, padding=1, bias=False),
build_norm_layer(norm_cfg, out_channels * channels_factor)[1],
nn.ReLU(inplace=True),
nn.Conv2d(out_channels * channels_factor, out_channels * channels_factor, kernel_size=3,
padding=1, bias=False),
build_norm_layer(norm_cfg, out_channels * channels_factor)[1],
nn.ReLU(inplace=True),
)
if self.extra_upsample:
self.up2 = nn.Sequential(
nn.Upsample(scale_factor=extra_upsample, mode='bilinear', align_corners=True),
nn.Conv2d(out_channels * channels_factor, out_channels, kernel_size=3, padding=1, bias=False),
build_norm_layer(norm_cfg, out_channels)[1],
nn.ReLU(inplace=True),
nn.Conv2d(out_channels, out_channels, kernel_size=1, padding=0)
)
self.lateral = lateral is not None
if self.lateral:
self.lateral_conv = nn.Sequential(
nn.Conv2d(lateral, lateral, kernel_size=1, padding=0, bias=False),
build_norm_layer(norm_cfg, lateral)[1],
nn.ReLU(inplace=True)
)
#@torch.compile
def forward(self, feats):
"""
Args:
feats: List[Tensor,] multi-level features
List[(B, C1, H, W), (B, C2, H/2, W/2), (B, C3, H/4, W/4)]
Returns:
x: (B, C_out, 2*H, 2*W)
"""
x2, x1 = feats[self.input_feature_index[0]], feats[self.input_feature_index[1]]
if self.lateral:
x2 = self.lateral_conv(x2)
x1 = self.up(x1) # (B, C3, H, W)
x1 = torch.cat([x2, x1], dim=1) # (B, C1+C3, H, W)
x = self.conv(x1) # (B, C', H, W)
if self.extra_upsample:
x = self.up2(x) # (B, C_out, 2*H, 2*W)
return x
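# A minimal shape sketch (hypothetical channel sizes, not part of the module):
#   feats = [torch.rand(1, 64, 64, 64),    # (B, C1, H, W)
#            torch.rand(1, 128, 32, 32),   # skipped by input_feature_index=(0, 2)
#            torch.rand(1, 256, 16, 16)]   # (B, C3, H/4, W/4)
#   neck = FPN_LSS(in_channels=64 + 256, out_channels=256)
#   out = neck(feats)  # (1, 256, 128, 128): concat at H=64, then extra_upsample=2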
@NECKS.register_module()
class LSSFPN3D(nn.Module):
def __init__(self,
in_channels,
out_channels,
with_cp=False):
super().__init__()
self.up1 = nn.Upsample(
scale_factor=2, mode='trilinear', align_corners=True)
self.up2 = nn.Upsample(
scale_factor=4, mode='trilinear', align_corners=True)
self.conv = ConvModule(
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=0,
bias=False,
conv_cfg=dict(type='Conv3d'),
norm_cfg=dict(type='BN3d', ),
act_cfg=dict(type='ReLU', inplace=True))
self.with_cp = with_cp
def forward(self, feats):
"""
Args:
feats: List[
(B, C, Dz, Dy, Dx),
(B, 2C, Dz/2, Dy/2, Dx/2),
(B, 4C, Dz/4, Dy/4, Dx/4)
]
Returns:
x: (B, C, Dz, Dy, Dx)
"""
x_8, x_16, x_32 = feats
x_16 = self.up1(x_16) # (B, 2C, Dz, Dy, Dx)
x_32 = self.up2(x_32) # (B, 4C, Dz, Dy, Dx)
x = torch.cat([x_8, x_16, x_32], dim=1) # (B, 7C, Dz, Dy, Dx)
if self.with_cp:
x = checkpoint(self.conv, x)
else:
x = self.conv(x) # (B, C, Dz, Dy, Dx)
return x
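# A minimal shape sketch (hypothetical sizes; in_channels must be C + 2C + 4C = 7C):
#   feats = [torch.rand(1, 8, 4, 16, 16),  # (B, C,  Dz,   Dy,   Dx)
#            torch.rand(1, 16, 2, 8, 8),   # (B, 2C, Dz/2, Dy/2, Dx/2)
#            torch.rand(1, 32, 1, 4, 4)]   # (B, 4C, Dz/4, Dy/4, Dx/4)
#   neck = LSSFPN3D(in_channels=56, out_channels=8)
#   out = neck(feats)  # (1, 8, 4, 16, 16)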
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import torch.nn as nn
import torch.distributed as dist
from mmcv.runner import BaseModule, force_fp32
from mmdet3d.models.builder import NECKS
from ...ops import bev_pool_v2
from ..model_utils import DepthNet
from torch.cuda.amp.autocast_mode import autocast
import torch.nn.functional as F
@NECKS.register_module(force=True)
class LSSViewTransformer(BaseModule):
r"""Lift-Splat-Shoot view transformer with BEVPoolv2 implementation.
Please refer to the LSS `paper <https://arxiv.org/abs/2008.05711>`_ and the
BEVPoolv2 `paper <https://arxiv.org/abs/2211.17111>`_.
Args:
grid_config (dict): Config of grid along each axis in format of
(lower_bound, upper_bound, interval). axis in {x, y, z, depth}.
input_size (tuple(int)): Size of input images in format of (height,
width).
downsample (int): Downsample factor from the input size to the feature
size.
in_channels (int): Channels of input feature.
out_channels (int): Channels of transformed feature.
accelerate (bool): Whether the view transformation is conducted with
acceleration. Note: the intrinsic and extrinsic of cameras should
be constant when 'accelerate' is set true.
sid (bool): Whether to use Spacing Increasing Discretization (SID)
for the depth distribution, as in `STS: Surround-view Temporal
Stereo for Multi-view 3D Detection`.
collapse_z (bool): Whether to collapse in z direction.
"""
def __init__(
self,
grid_config,
input_size,
downsample=16,
in_channels=512,
out_channels=64,
accelerate=False,
sid=False,
collapse_z=True,
):
super(LSSViewTransformer, self).__init__()
self.grid_config = grid_config
self.downsample = downsample
self.create_grid_infos(**grid_config)
self.sid = sid
self.frustum = self.create_frustum(grid_config['depth'],
input_size, downsample) # (D, fH, fW, 3) 3:(u, v, d)
self.out_channels = out_channels
self.in_channels = in_channels
self.depth_net = nn.Conv2d(
in_channels, self.D + self.out_channels, kernel_size=1, padding=0)
self.accelerate = accelerate
self.initial_flag = True
self.collapse_z = collapse_z
def create_grid_infos(self, x, y, z, **kwargs):
"""Generate the grid information including the lower bound, interval,
and size.
Args:
x (tuple(float)): Config of grid along x axis in format of
(lower_bound, upper_bound, interval).
y (tuple(float)): Config of grid along y axis in format of
(lower_bound, upper_bound, interval).
z (tuple(float)): Config of grid along z axis in format of
(lower_bound, upper_bound, interval).
**kwargs: Container for other potential parameters
"""
self.grid_lower_bound = torch.Tensor([cfg[0] for cfg in [x, y, z]]) # (min_x, min_y, min_z)
self.grid_interval = torch.Tensor([cfg[2] for cfg in [x, y, z]]) # (dx, dy, dz)
self.grid_size = torch.Tensor([(cfg[1] - cfg[0]) / cfg[2]
for cfg in [x, y, z]]) # (Dx, Dy, Dz)
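# e.g. a (hypothetical) x = (-51.2, 51.2, 0.8) gives lower bound -51.2,
# interval 0.8 and grid size (51.2 - (-51.2)) / 0.8 = 128 voxels along x.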
def create_frustum(self, depth_cfg, input_size, downsample):
"""Generate the frustum template for each image.
Args:
depth_cfg (tuple(float)): Config of grid along the depth axis in
format of (lower_bound, upper_bound, interval).
input_size (tuple(int)): Size of input images in format of (height,
width).
downsample (int): Downsample scale factor from the input size to
the feature size.
Returns:
frustum: (D, fH, fW, 3) 3:(u, v, d)
"""
H_in, W_in = input_size
H_feat, W_feat = H_in // downsample, W_in // downsample
d = torch.arange(*depth_cfg, dtype=torch.float)\
.view(-1, 1, 1).expand(-1, H_feat, W_feat) # (D, fH, fW)
self.D = d.shape[0]
if self.sid:
d_sid = torch.arange(self.D).float()
depth_cfg_t = torch.tensor(depth_cfg).float()
d_sid = torch.exp(torch.log(depth_cfg_t[0]) + d_sid / (self.D-1) *
torch.log((depth_cfg_t[1]-1) / depth_cfg_t[0]))
d = d_sid.view(-1, 1, 1).expand(-1, H_feat, W_feat)
x = torch.linspace(0, W_in - 1, W_feat, dtype=torch.float)\
.view(1, 1, W_feat).expand(self.D, H_feat, W_feat) # (D, fH, fW)
y = torch.linspace(0, H_in - 1, H_feat, dtype=torch.float)\
.view(1, H_feat, 1).expand(self.D, H_feat, W_feat) # (D, fH, fW)
o = torch.stack((x, y, d), -1) # (D, fH, fW, 3) 3:(u, v, d)
if dist.is_initialized():
return o.to(f"cuda:{dist.get_rank()}", non_blocking=True)
else:
return o
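# e.g. with (hypothetical) input_size=(256, 704), downsample=16 and
# depth=(1.0, 60.0, 1.0): fH=16, fW=44, D=59, so the frustum has shape
# (59, 16, 44, 3).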
def get_lidar_coor(self, sensor2ego, ego2global, cam2imgs, post_rots, post_trans,
bda):
"""Calculate the locations of the frustum points in the lidar
coordinate system.
Args:
sensor2ego (torch.Tensor): Transformation from camera coordinate
system to ego coordinate system in shape (B, N_cams, 4, 4).
ego2global (torch.Tensor): Transformation from ego coordinate
system to global coordinate system in shape (B, N_cams, 4, 4).
cam2imgs (torch.Tensor): Camera intrinsic matrixes in shape
(B, N_cams, 3, 3).
post_rots (torch.Tensor): Rotation in camera coordinate system in
shape (B, N_cams, 3, 3). It is derived from the image view
augmentation.
post_trans (torch.Tensor): Translation in camera coordinate system
derived from image view augmentation in shape (B, N_cams, 3).
bda (torch.Tensor): Transformation in bev in shape (B, 3, 3).
Returns:
torch.tensor: Point coordinates in shape (B, N_cams, D, fH, fW, 3)
"""
B, N, _, _ = sensor2ego.shape
# post-transformation
# B x N x D x H x W x 3
#points = self.frustum.to(sensor2ego) - post_trans.view(B, N, 1, 1, 1, 3)
points = self.frustum - post_trans.view(B, N, 1, 1, 1, 3)
points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3)\
.matmul(points.unsqueeze(-1))
# cam_to_ego
points = torch.cat(
(points[..., :2, :] * points[..., 2:3, :], points[..., 2:3, :]), 5)
combine = sensor2ego[:,:,:3,:3].matmul(torch.inverse(cam2imgs))
points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1)
points += sensor2ego[:,:,:3, 3].view(B, N, 1, 1, 1, 3)
points = bda.view(B, 1, 1, 1, 1, 3,
3).matmul(points.unsqueeze(-1)).squeeze(-1)
return points
#@torch.compile
def get_ego_coor(self, sensor2ego, ego2global, cam2imgs, post_rots, post_trans,
bda):
"""Calculate the locations of the frustum points in the lidar
coordinate system.
Args:
sensor2ego (torch.Tensor): Transformation from camera coordinate system to
ego coordinate system in shape (B, N_cams, 4, 4).
ego2global (torch.Tensor): Translation from ego coordinate system to
global coordinate system in shape (B, N_cams, 4, 4).
cam2imgs (torch.Tensor): Camera intrinsic matrixes in shape
(B, N_cams, 3, 3).
post_rots (torch.Tensor): Rotation in camera coordinate system in
shape (B, N_cams, 3, 3). It is derived from the image view
augmentation.
post_trans (torch.Tensor): Translation in camera coordinate system
derived from image view augmentation in shape (B, N_cams, 3).
bda (torch.Tensor): Transformation in bev. (B, 3, 3)
Returns:
torch.tensor: Point coordinates in shape (B, N, D, fH, fW, 3)
"""
B, N, _, _ = sensor2ego.shape
# post-transformation
# (D, fH, fW, 3) - (B, N, 1, 1, 1, 3) --> (B, N, D, fH, fW, 3)
#points = self.frustum.to(sensor2ego) - post_trans.view(B, N, 1, 1, 1, 3)
points = self.frustum - post_trans.view(B, N, 1, 1, 1, 3)
# (B, N, 1, 1, 1, 3, 3) @ (B, N, D, fH, fW, 3, 1) --> (B, N, D, fH, fW, 3, 1)
points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3)\
.matmul(points.unsqueeze(-1))
# cam_to_ego
# (B, N_, D, fH, fW, 3, 1) 3: (du, dv, d)
points = torch.cat(
(points[..., :2, :] * points[..., 2:3, :], points[..., 2:3, :]), 5)
# R_{c->e} @ K^-1
combine = sensor2ego[:, :, :3, :3].matmul(torch.inverse(cam2imgs))
# (B, N, 1, 1, 1, 3, 3) @ (B, N, D, fH, fW, 3, 1) --> (B, N, D, fH, fW, 3, 1)
# --> (B, N, D, fH, fW, 3)
points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1)
# (B, N, D, fH, fW, 3) + (B, N, 1, 1, 1, 3) --> (B, N, D, fH, fW, 3)
points += sensor2ego[:, :, :3, 3].view(B, N, 1, 1, 1, 3)
# (B, 1, 1, 1, 3, 3) @ (B, N, D, fH, fW, 3, 1) --> (B, N, D, fH, fW, 3, 1)
# --> (B, N, D, fH, fW, 3)
points = bda.view(B, 1, 1, 1, 1, 3,
3).matmul(points.unsqueeze(-1)).squeeze(-1)
return points
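# In matrix form, for each frustum point (u, v, d) the chain above computes
#   P_ego = bda @ (R_c2e @ K^-1 @ [u*d, v*d, d]^T + t_c2e),
# after first undoing the image-view augmentation (post_rots, post_trans).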
def init_acceleration_v2(self, coor):
"""Pre-compute the necessary information in acceleration including the
index of points in the final feature.
Args:
coor (torch.tensor): Coordinate of points in lidar space in shape
(B, N, D, H, W, 3).
"""
ranks_bev, ranks_depth, ranks_feat, \
interval_starts, interval_lengths = \
self.voxel_pooling_prepare_v2(coor)
# ranks_bev: (N_points, ),
# ranks_depth: (N_points, ),
# ranks_feat: (N_points, ),
# interval_starts: (N_pillar, )
# interval_lengths: (N_pillar, )
self.ranks_bev = ranks_bev.int().contiguous()
self.ranks_feat = ranks_feat.int().contiguous()
self.ranks_depth = ranks_depth.int().contiguous()
self.interval_starts = interval_starts.int().contiguous()
self.interval_lengths = interval_lengths.int().contiguous()
def voxel_pooling_v2(self, coor, depth, feat):
"""
Args:
coor: (B, N, D, fH, fW, 3)
depth: (B, N, D, fH, fW)
feat: (B, N, C, fH, fW)
Returns:
bev_feat: (B, C*Dz(=1), Dy, Dx)
"""
ranks_bev, ranks_depth, ranks_feat, \
interval_starts, interval_lengths = \
self.voxel_pooling_prepare_v2(coor)
# ranks_bev: (N_points, ),
# ranks_depth: (N_points, ),
# ranks_feat: (N_points, ),
# interval_starts: (N_pillar, )
# interval_lengths: (N_pillar, )
if ranks_feat is None:
print('warning ---> no points within the predefined '
'bev receptive field')
dummy = torch.zeros(size=[
feat.shape[0], feat.shape[2],
int(self.grid_size[2]),
int(self.grid_size[1]),
int(self.grid_size[0])
]).to(feat) # (B, C, Dz, Dy, Dx)
dummy = torch.cat(dummy.unbind(dim=2), 1) # (B, C*Dz, Dy, Dx)
return dummy
feat = feat.permute(0, 1, 3, 4, 2) # (B, N, fH, fW, C)
bev_feat_shape = (depth.shape[0], int(self.grid_size[2]),
int(self.grid_size[1]), int(self.grid_size[0]),
feat.shape[-1]) # (B, Dz, Dy, Dx, C)
bev_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev,
bev_feat_shape, interval_starts,
interval_lengths) # (B, C, Dz, Dy, Dx)
# collapse Z
if self.collapse_z:
bev_feat = torch.cat(bev_feat.unbind(dim=2), 1) # (B, C*Dz, Dy, Dx)
return bev_feat
#@torch.compile(options={"triton.cudagraphs":True})
def voxel_pooling_prepare_v2(self, coor):
"""Data preparation for voxel pooling.
Args:
coor (torch.tensor): Coordinate of points in the lidar space in
shape (B, N, D, H, W, 3).
Returns:
tuple[torch.tensor]:
ranks_bev: Rank of the voxel that a point belongs to, in shape (N_points, ),
rank in range [0, B*Dx*Dy*Dz-1].
ranks_depth: Reserved index of points in the depth space, in shape (N_points, ),
rank in range [0, B*N*D*fH*fW-1].
ranks_feat: Reserved index of points in the feature space, in shape (N_points, ),
rank in range [0, B*N*fH*fW-1].
interval_starts: (N_pillar, )
interval_lengths: (N_pillar, )
"""
#with torch.cuda.device(coor.device):
#zero_t = torch.scalar_tensor(0).to(coor.device)
if dist.is_initialized():
dv = f"cuda:{dist.get_rank()}"
else:
dv = str(coor.device)
assert dv == str(coor.device)
with torch.cuda.device(dv):
B, N, D, H, W, _ = coor.shape
num_points = B * N * D * H * W
# record the index of selected points for acceleration purpose
ranks_depth = torch.arange(
0, num_points, dtype=torch.int, device=coor.device) # (B*N*D*H*W, ), [0, 1, ..., B*N*D*fH*fW-1]
ranks_feat = torch.arange(
0, num_points // D, dtype=torch.int, device=coor.device) # [0, 1, ..., B*N*fH*fW-1]
ranks_feat = ranks_feat.reshape(B, N, 1, H, W)
ranks_feat = ranks_feat.expand(B, N, D, H, W).flatten() # (B*N*D*fH*fW, )
# convert coordinate into the voxel space
# ((B, N, D, fH, fW, 3) - (3, )) / (3, ) --> (B, N, D, fH, fW, 3) 3:(x, y, z) grid coords.
coor = ((coor - self.grid_lower_bound.to(coor)) /
self.grid_interval.to(coor))
coor = coor.long().view(num_points, 3) # (B, N, D, fH, fW, 3) --> (B*N*D*fH*fW, 3)
# (B, N*D*fH*fW) --> (B*N*D*fH*fW, 1)
batch_idx = torch.arange(0, B).reshape(B, 1). \
expand(B, num_points // B).reshape(num_points, 1).to(coor)
coor = torch.cat((coor, batch_idx), 1) # (B*N*D*fH*fW, 4) 4: (x, y, z, batch_id)
# filter out points that are outside box
kept = (coor[:, 0] >= 0) & (coor[:, 0] < self.grid_size[0]) & \
(coor[:, 1] >= 0) & (coor[:, 1] < self.grid_size[1]) & \
(coor[:, 2] >= 0) & (coor[:, 2] < self.grid_size[2])
if not kept.any():
return None, None, None, None, None
# (N_points, 4), (N_points, ), (N_points, )
coor, ranks_depth, ranks_feat = \
coor[kept], ranks_depth[kept], ranks_feat[kept]
# get tensors from the same voxel next to each other
ranks_bev = coor[:, 3] * (
self.grid_size[2] * self.grid_size[1] * self.grid_size[0])
ranks_bev += coor[:, 2] * (self.grid_size[1] * self.grid_size[0])
ranks_bev += coor[:, 1] * self.grid_size[0] + coor[:, 0]
order = ranks_bev.argsort()
# (N_points, ), (N_points, ), (N_points, )
ranks_bev, ranks_depth, ranks_feat = \
ranks_bev[order], ranks_depth[order], ranks_feat[order]
kept = torch.ones(
ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
kept[1:] = ranks_bev[1:] != ranks_bev[:-1]
interval_starts = torch.where(kept)[0].int()
if len(interval_starts) == 0:
return None, None, None, None, None
interval_lengths = torch.zeros_like(interval_starts)
interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1]
return ranks_bev.int().contiguous(), ranks_depth.int().contiguous(
), ranks_feat.int().contiguous(), interval_starts.int().contiguous(
), interval_lengths.int().contiguous()
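# Tiny worked example (hypothetical): for sorted ranks_bev = (0, 0, 1, 1, 1, 2),
# kept = (1, 0, 1, 0, 0, 1), so interval_starts = (0, 2, 5) and
# interval_lengths = (2, 3, 1): one entry per pillar, giving its first point
# and its number of pooled points.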
def pre_compute(self, input):
if self.initial_flag:
coor = self.get_ego_coor(*input[1:7]) # (B, N, D, fH, fW, 3)
self.init_acceleration_v2(coor)
self.initial_flag = False
def view_transform_core(self, input, depth, tran_feat):
"""
Args:
input (list(torch.tensor)):
imgs: (B, N, 3, H, W) # N_views = 6 * (N_history + 1)
sensor2egos: (B, N, 4, 4)
ego2globals: (B, N, 4, 4)
intrins: (B, N, 3, 3)
post_rots: (B, N, 3, 3)
post_trans: (B, N, 3)
bda_rot: (B, 3, 3)
depth: (B*N, D, fH, fW)
tran_feat: (B*N, C, fH, fW)
Returns:
bev_feat: (B, C*Dz(=1), Dy, Dx)
depth: (B*N, D, fH, fW)
"""
B, N, C, H, W = input[0].shape
# Lift-Splat
if self.accelerate:
feat = tran_feat.view(B, N, self.out_channels, H, W) # (B, N, C, fH, fW)
feat = feat.permute(0, 1, 3, 4, 2) # (B, N, fH, fW, C)
depth = depth.view(B, N, self.D, H, W) # (B, N, D, fH, fW)
bev_feat_shape = (depth.shape[0], int(self.grid_size[2]),
int(self.grid_size[1]), int(self.grid_size[0]),
feat.shape[-1]) # (B, Dz, Dy, Dx, C)
bev_feat = bev_pool_v2(depth, feat, self.ranks_depth,
self.ranks_feat, self.ranks_bev,
bev_feat_shape, self.interval_starts,
self.interval_lengths) # (B, C, Dz, Dy, Dx)
bev_feat = bev_feat.squeeze(2) # (B, C, Dy, Dx)
else:
coor = self.get_ego_coor(*input[1:7]) # (B, N, D, fH, fW, 3)
bev_feat = self.voxel_pooling_v2(
coor, depth.view(B, N, self.D, H, W),
tran_feat.view(B, N, self.out_channels, H, W)) # (B, C*Dz(=1), Dy, Dx)
return bev_feat, depth
def view_transform(self, input, depth, tran_feat):
"""
Args:
input (list(torch.tensor)):
imgs: (B, N, C, H, W) # N_views = 6 * (N_history + 1)
sensor2egos: (B, N, 4, 4)
ego2globals: (B, N, 4, 4)
intrins: (B, N, 3, 3)
post_rots: (B, N, 3, 3)
post_trans: (B, N, 3)
bda_rot: (B, 3, 3)
depth: (B*N, D, fH, fW)
tran_feat: (B*N, C, fH, fW)
Returns:
bev_feat: (B, C, Dy, Dx)
depth: (B*N, D, fH, fW)
"""
if self.accelerate:
self.pre_compute(input)
return self.view_transform_core(input, depth, tran_feat)
#@torch.compile
def forward(self, input):
"""Transform image-view feature into bird-eye-view feature.
Args:
input (list(torch.tensor)):
imgs: (B, N_views, 3, H, W) # N_views = 6 * (N_history + 1)
sensor2egos: (B, N_views, 4, 4)
ego2globals: (B, N_views, 4, 4)
intrins: (B, N_views, 3, 3)
post_rots: (B, N_views, 3, 3)
post_trans: (B, N_views, 3)
bda_rot: (B, 3, 3)
Returns:
bev_feat: (B, C, Dy, Dx)
depth: (B*N, D, fH, fW)
"""
x = input[0] # (B, N, C_in, fH, fW)
B, N, C, H, W = x.shape
x = x.view(B * N, C, H, W) # (B*N, C_in, fH, fW)
# (B*N, C_in, fH, fW) --> (B*N, D+C, fH, fW)
x = self.depth_net(x)
depth_digit = x[:, :self.D, ...] # (B*N, D, fH, fW)
tran_feat = x[:, self.D:self.D + self.out_channels, ...] # (B*N, C, fH, fW)
depth = depth_digit.softmax(dim=1)
return self.view_transform(input, depth, tran_feat)
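# Conceptually, lift-splat takes the outer product of the depth distribution
# (B*N, D, fH, fW) and the context feature (B*N, C, fH, fW) to form a frustum
# of features, which voxel pooling then splats onto the BEV grid; bev_pool_v2
# fuses both steps without materializing the frustum tensor.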
def get_mlp_input(self, rot, tran, intrin, post_rot, post_tran, bda):
return None
@NECKS.register_module()
class LSSViewTransformerBEVDepth(LSSViewTransformer):
def __init__(self, loss_depth_weight=3.0, depthnet_cfg=dict(), **kwargs):
super(LSSViewTransformerBEVDepth, self).__init__(**kwargs)
self.loss_depth_weight = loss_depth_weight
self.depth_net = DepthNet(
in_channels=self.in_channels,
mid_channels=self.in_channels,
context_channels=self.out_channels,
depth_channels=self.D,
**depthnet_cfg)
def get_mlp_input(self, sensor2ego, ego2global, intrin, post_rot, post_tran, bda):
"""
Args:
sensor2ego: (B, N_views=6, 4, 4)
ego2global: (B, N_views=6, 4, 4)
intrin: (B, N_views, 3, 3)
post_rot: (B, N_views, 3, 3)
post_tran: (B, N_views, 3)
bda: (B, 3, 3)
Returns:
mlp_input: (B, N_views, 27)
"""
B, N, _, _ = sensor2ego.shape
bda = bda.view(B, 1, 3, 3).repeat(1, N, 1, 1) # (B, 3, 3) --> (B, N, 3, 3)
mlp_input = torch.stack([
intrin[:, :, 0, 0], # fx
intrin[:, :, 1, 1], # fy
intrin[:, :, 0, 2], # cx
intrin[:, :, 1, 2], # cy
post_rot[:, :, 0, 0],
post_rot[:, :, 0, 1],
post_tran[:, :, 0],
post_rot[:, :, 1, 0],
post_rot[:, :, 1, 1],
post_tran[:, :, 1],
bda[:, :, 0, 0],
bda[:, :, 0, 1],
bda[:, :, 1, 0],
bda[:, :, 1, 1],
bda[:, :, 2, 2]
], dim=-1) # (B, N_views, 15)
sensor2ego = sensor2ego[:, :, :3, :].reshape(B, N, -1)
mlp_input = torch.cat([mlp_input, sensor2ego], dim=-1) # (B, N_views, 27)
return mlp_input
def forward(self, input, stereo_metas=None):
"""
Args:
input (list(torch.tensor)):
imgs: (B, N_views, 3, H, W) # N_views = 6 * (N_history + 1)
sensor2egos: (B, N_views, 4, 4)
ego2globals: (B, N_views, 4, 4)
intrins: (B, N_views, 3, 3)
post_rots: (B, N_views, 3, 3)
post_trans: (B, N_views, 3)
bda_rot: (B, 3, 3)
mlp_input: (B, N_views, 27)
stereo_metas: None or dict{
k2s_sensor: (B, N_views, 4, 4)
intrins: (B, N_views, 3, 3)
post_rots: (B, N_views, 3, 3)
post_trans: (B, N_views, 3)
frustum: (D, fH_stereo, fW_stereo, 3) 3:(u, v, d)
cv_downsample: 4,
downsample: self.img_view_transformer.downsample=16,
grid_config: self.img_view_transformer.grid_config,
cv_feat_list: [feat_prev_iv, stereo_feat]
}
Returns:
bev_feat: (B, C, Dy, Dx)
depth: (B*N, D, fH, fW)
"""
(x, rots, trans, intrins, post_rots, post_trans, bda,
mlp_input) = input[:8]
B, N, C, H, W = x.shape
x = x.view(B * N, C, H, W) # (B*N_views, C, fH, fW)
x = self.depth_net(x, mlp_input, stereo_metas) # (B*N_views, D+C_context, fH, fW)
depth_digit = x[:, :self.D, ...] # (B*N_views, D, fH, fW)
tran_feat = x[:, self.D:self.D + self.out_channels, ...] # (B*N_views, C_context, fH, fW)
depth = depth_digit.softmax(dim=1) # (B*N_views, D, fH, fW)
bev_feat, depth = self.view_transform(input, depth, tran_feat)
return bev_feat, depth
def get_downsampled_gt_depth(self, gt_depths):
"""
Input:
gt_depths: (B, N_views, img_h, img_w)
Output:
gt_depths: (B*N_views*fH*fW, D)
"""
B, N, H, W = gt_depths.shape
# (B*N_views, fH, downsample, fW, downsample, 1)
gt_depths = gt_depths.view(B * N,
H // self.downsample, self.downsample,
W // self.downsample, self.downsample,
1)
# (B*N_views, fH, fW, 1, downsample, downsample)
gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous()
# (B*N_views*fH*fW, downsample, downsample)
gt_depths = gt_depths.view(-1, self.downsample * self.downsample)
gt_depths_tmp = torch.where(gt_depths == 0.0,
1e5 * torch.ones_like(gt_depths),
gt_depths)
gt_depths = torch.min(gt_depths_tmp, dim=-1).values
# (B*N_views, fH, fW)
gt_depths = gt_depths.view(B * N, H // self.downsample, W // self.downsample)
if not self.sid:
# bin index = (d - (min_dist - interval_dist)) / interval_dist
#           = (d - min_dist) / interval_dist + 1
gt_depths = (gt_depths - (self.grid_config['depth'][0] -
self.grid_config['depth'][2])) / \
self.grid_config['depth'][2]
else:
gt_depths = torch.log(gt_depths) - torch.log(
torch.tensor(self.grid_config['depth'][0]).float())
gt_depths = gt_depths * (self.D - 1) / torch.log(
torch.tensor(self.grid_config['depth'][1] - 1.).float() /
self.grid_config['depth'][0])
gt_depths = gt_depths + 1.
gt_depths = torch.where((gt_depths < self.D + 1) & (gt_depths >= 0.0),
gt_depths, torch.zeros_like(gt_depths)) # (B*N_views, fH, fW)
gt_depths = F.one_hot(
gt_depths.long(), num_classes=self.D + 1).view(-1, self.D + 1)[:, 1:] # (B*N_views*fH*fW, D)
return gt_depths.float()
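# e.g. with (hypothetical) grid_config['depth'] = (1.0, 60.0, 1.0) and
# sid=False, a raw depth of 2.3 m maps to (2.3 - (1.0 - 1.0)) / 1.0 = 2.3,
# i.e. bin index 2 after the long() cast; after dropping the class-0 column
# it lands in the second of the D depth bins.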
@force_fp32()
def get_depth_loss(self, depth_labels, depth_preds):
"""
Args:
depth_labels: (B, N_views, img_h, img_w)
depth_preds: (B*N_views, D, fH, fW)
Returns:
"""
depth_labels = self.get_downsampled_gt_depth(depth_labels) # (B*N_views*fH*fW, D)
# (B*N_views, D, fH, fW) --> (B*N_views, fH, fW, D) --> (B*N_views*fH*fW, D)
depth_preds = depth_preds.permute(0, 2, 3,
1).contiguous().view(-1, self.D)
fg_mask = torch.max(depth_labels, dim=1).values > 0.0
depth_labels = depth_labels[fg_mask]
depth_preds = depth_preds[fg_mask]
with autocast(enabled=False):
depth_loss = F.binary_cross_entropy(
depth_preds,
depth_labels,
reduction='none',
).sum() / max(1.0, fg_mask.sum())
return self.loss_depth_weight * depth_loss
@NECKS.register_module()
class LSSViewTransformerBEVStereo(LSSViewTransformerBEVDepth):
def __init__(self, **kwargs):
super(LSSViewTransformerBEVStereo, self).__init__(**kwargs)
# (D, fH_stereo, fW_stereo, 3) 3:(u, v, d)
self.cv_frustum = self.create_frustum(kwargs['grid_config']['depth'],
kwargs['input_size'],
downsample=4)
from .bev_pool import bev_pool
from .bev_pool_v2 import bev_pool_v2, TRTBEVPoolv2
from .nearest_assign import nearest_assign
__all__ = ['bev_pool', 'bev_pool_v2', 'TRTBEVPoolv2', 'nearest_assign']
import torch
from . import bev_pool_ext
class QuickBevPoolingCuda(torch.autograd.Function):
@staticmethod
def forward(ctx, feats, coords, ranks, B, D, H, W, pooling_method):
"""
Args:
ctx:
feats: (N, C)
coords: (N, 4) 4: (x_id, y_id, z_id, batch_id)
ranks: (N, ) eg: (0, 0, 1, 1, 1, 2, 2)
B: batch size
D: Dz
H: Dy
W: Dx
Returns:
out: (B, D, H, W, C)
"""
kept = torch.ones(feats.shape[0], device=feats.device, dtype=torch.bool) # (N, )
kept[1:] = ranks[1:] != ranks[:-1] # 1 at interval boundaries (where the pillar id changes), else 0. eg: (1, 0, 1, 0, 0, 1, 0)
interval_starts = torch.where(kept)[0].int() # start position of each pillar, (N_pillar, ) eg: (0, 2, 5)
interval_lengths = torch.zeros_like(interval_starts) # number of points in each pillar, (N_pillar, ) eg: (0, 0, 0)
interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] # eg: (2, 3, 0)
interval_lengths[-1] = feats.shape[0] - interval_starts[-1] # eg: (2, 3, 2)
coords = coords.int()
if pooling_method == 'sum':
out = bev_pool_ext.bev_sum_pool_forward(
feats, # (N, C)
coords, # (N, 4) 4: (x_id, y_id, z_id, batch_id)
interval_lengths, # (N_pillar, )
interval_starts, # (N_pillar, )
B,
D,
H,
W,
)
elif pooling_method == 'max':
out = bev_pool_ext.bev_max_pool_forward(
feats, # (N, C)
coords, # (N, 4) 4: (x_id, y_id, z_id, batch_id)
interval_lengths, # (N_pillar, )
interval_starts, # (N_pillar, )
B,
D,
H,
W,
)
ctx.save_for_backward(interval_starts, interval_lengths, coords)
ctx.saved_shapes = B, D, H, W
ctx.pooling_method = pooling_method
return out
@staticmethod
def backward(ctx, out_grad):
"""
Args:
ctx:
out_grad: (B, D, H, W, C)
Returns:
x_grad: (N, C)
"""
# (N_pillar, ), (N_pillar, ), (N, 4) 4: (x_id, y_id, z_id, batch_id)
interval_starts, interval_lengths, geom_coords = ctx.saved_tensors
B, D, H, W = ctx.saved_shapes
pooling_method = ctx.pooling_method
out_grad = out_grad.contiguous()
if pooling_method == 'sum':
x_grad = bev_pool_ext.bev_sum_pool_backward(
out_grad, # (B, D, H, W, C)
geom_coords, # (N, 4) 4: (x_id, y_id, z_id, batch_id)
interval_lengths, # (N_pillar, )
interval_starts, # (N_pillar, )
B,
D,
H,
W,
) # (N, C)
elif pooling_method == 'max':
x_grad = bev_pool_ext.bev_max_pool_backward(
out_grad, # (B, D, H, W, C)
geom_coords, # (N, 4) 4: (x_id, y_id, z_id, batch_id)
interval_lengths, # (N_pillar, )
interval_starts, # (N_pillar, )
B,
D,
H,
W,
) # (N, C)
return x_grad, None, None, None, None, None, None, None
def bev_pool(feats, coords, B, D, H, W, pooling_method='sum'):
"""
Args:
feats: (N, C)
coords: (N, 4) 4: (x_id, y_id, z_id, batch_id)
B: batch size
D: Dz
H: Dy
W: Dx
Returns:
bev_features: (B, C, D, H, W)
"""
assert feats.shape[0] == coords.shape[0]
ranks = (
coords[:, 0] * (H * D * B)
+ coords[:, 1] * (D * B)
+ coords[:, 2] * B
+ coords[:, 3]
) # (N, )
indices = ranks.argsort() # (N, )
# (N, C), (N, 4), (N, )
feats, coords, ranks = feats[indices], coords[indices], ranks[indices]
x = QuickBevPoolingCuda.apply(feats, coords, ranks, B, D, H, W, pooling_method) # (B, D, H, W, C)
x = x.permute(0, 4, 1, 2, 3).contiguous() # (B, C, D, H, W)
return x
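# A minimal usage sketch (hypothetical sizes): pool N=4 points with C=2
# channels into a (B=1, D=1, H=2, W=2) grid:
#   feats = torch.ones(4, 2, device='cuda')
#   coords = torch.tensor([[0, 0, 0, 0], [0, 0, 0, 0],
#                          [1, 0, 0, 0], [1, 1, 0, 0]], device='cuda')
#   out = bev_pool(feats, coords, B=1, D=1, H=2, W=2)  # (1, 2, 1, 2, 2)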
#include <torch/torch.h>
#include <c10/cuda/CUDAGuard.h>
#include "bev_max_pool.h"
/*
Function: pillar pooling (forward, cuda)
Args:
geom_feats : input features, FloatTensor[N, C]
_geom_coords : input coordinates, IntTensor[N, 4] 4: (x_id, y_id, z_id, batch_id)
interval_lengths : how many points in each pooled point, IntTensor[N_pillar, ]
interval_starts : starting position for pooled point, IntTensor [N_pillar, ]
Return:
out : output features, FloatTensor[b, d, h, w, c]
*/
at::Tensor bev_max_pool_forward(
const at::Tensor _geom_feats,
const at::Tensor _geom_coords,
const at::Tensor _interval_lengths,
const at::Tensor _interval_starts,
int b, int d, int h, int w
) {
int n = _geom_feats.size(0);
int c = _geom_feats.size(1);
int n_intervals = _interval_lengths.size(0);
const at::cuda::OptionalCUDAGuard device_guard(device_of(_geom_feats));
const float* geom_feats = _geom_feats.data_ptr<float>();
const int* geom_coords = _geom_coords.data_ptr<int>();
const int* interval_lengths = _interval_lengths.data_ptr<int>();
const int* interval_starts = _interval_starts.data_ptr<int>();
auto options =
torch::TensorOptions().dtype(_geom_feats.dtype()).device(_geom_feats.device());
at::Tensor _out = torch::zeros({b, d, h, w, c}, options); // (B, D=Dz, H=Dy, W=Dx, C)
float* out = _out.data_ptr<float>();
bev_max_pool(
b, d, h, w, n, c, n_intervals, geom_feats,
geom_coords, interval_starts, interval_lengths, out
);
return _out;
}
/*
Function: pillar pooling (backward, cuda)
Args:
out_grad : input features, FloatTensor[B, D, H, W, C]
geom_coords : input coordinates, IntTensor[N, 4]
interval_lengths : how many points in each pooled point, IntTensor[N_pillar, ]
interval_starts : starting position for pooled point, IntTensor [N_pillar, ]
Return:
x_grad : output features, FloatTensor[N, C]
*/
at::Tensor bev_max_pool_backward(
const at::Tensor _out_grad,
const at::Tensor _geom_coords,
const at::Tensor _interval_lengths,
const at::Tensor _interval_starts,
int b, int d, int h, int w
) {
int n = _geom_coords.size(0);
int c = _out_grad.size(4);
int n_intervals = _interval_lengths.size(0);
const at::cuda::OptionalCUDAGuard device_guard(device_of(_out_grad));
const float* out_grad = _out_grad.data_ptr<float>();
const int* geom_coords = _geom_coords.data_ptr<int>();
const int* interval_lengths = _interval_lengths.data_ptr<int>();
const int* interval_starts = _interval_starts.data_ptr<int>();
auto options =
torch::TensorOptions().dtype(_out_grad.dtype()).device(_out_grad.device());
at::Tensor _x_grad = torch::zeros({n, c}, options); // (N, C)
float* x_grad = _x_grad.data_ptr<float>();
bev_max_pool_grad(
b, d, h, w, n, c, n_intervals, out_grad,
geom_coords, interval_starts, interval_lengths, x_grad
);
return _x_grad;
}
#ifndef _BEV_MAX_POOL_H
#define _BEV_MAX_POOL_H
#include <torch/torch.h>
#include <c10/cuda/CUDAGuard.h>
at::Tensor bev_max_pool_forward(
const at::Tensor _geom_feats,
const at::Tensor _geom_coords,
const at::Tensor _interval_lengths,
const at::Tensor _interval_starts,
int b, int d, int h, int w
);
at::Tensor bev_max_pool_backward(
const at::Tensor _out_grad,
const at::Tensor _geom_coords,
const at::Tensor _interval_lengths,
const at::Tensor _interval_starts,
int b, int d, int h, int w
);
// CUDA function declarations
void bev_max_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x,
const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out);
void bev_max_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad,
const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad);
#endif
#include <stdio.h>
#include <stdlib.h>
#include "bev_max_pool.h"
/*
Function: pillar pooling
Args:
b : batch size
d : depth of the feature map
h : height of pooled feature map
w : width of pooled feature map
n : number of input points
c : number of channels
n_intervals : number of unique points
geom_feats : input features, FloatTensor[n, c]
geom_coords : input coordinates, IntTensor[n, 4] 4: (x_id, y_id, z_id, batch_id)
interval_starts : starting position of each pooled point, IntTensor[n_intervals]
interval_lengths : how many points in each pooled point, IntTensor[n_intervals]
out : output features, FloatTensor[b, d, h, w, c]
*/
__global__ void bev_max_pool_kernel(int b, int d, int h, int w, int n, int c, int n_intervals,
const float *__restrict__ geom_feats,
const int *__restrict__ geom_coords,
const int *__restrict__ interval_starts,
const int *__restrict__ interval_lengths,
float* __restrict__ out) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int index = idx / c;
int cur_c = idx % c;
if (index >= n_intervals) return;
int interval_start = interval_starts[index];
int interval_length = interval_lengths[index];
const int* cur_geom_coords = geom_coords + interval_start * 4; // coordinates of the pillar handled by this thread, 4: (x_id, y_id, z_id, batch_id)
const float* cur_geom_feats = geom_feats + interval_start * c + cur_c;
float* cur_out = out + cur_geom_coords[3] * d * h * w * c +
cur_geom_coords[2] * h * w * c + cur_geom_coords[1] * w * c +
cur_geom_coords[0] * c + cur_c;
float pmax = 0; // assumes the pooled features are non-negative
for(int i = 0; i < interval_length; i++){
if (cur_geom_feats[i * c] > pmax)
pmax = cur_geom_feats[i * c];
}
*cur_out = pmax;
}
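/*
Thread mapping (matching the launch below): ceil(n_intervals * c / 256) blocks
of 256 threads; thread idx handles channel (idx % c) of pillar (idx / c) and
scans that pillar's interval_length points with stride c.
*/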
/*
Function: pillar pooling backward
Args:
b : batch size
d : depth of the feature map
h : height of pooled feature map
w : width of pooled feature map
n : number of input points
c : number of channels
n_intervals : number of unique points
out_grad : gradient of the BEV fmap from top, FloatTensor[b, d, h, w, c]
geom_coords : input coordinates, IntTensor[N, 4] 4: (x_id, y_id, z_id, batch_id)
interval_lengths : how many points in each pooled point, IntTensor[n_intervals]
interval_starts : starting position for pooled point, IntTensor[n_intervals]
x_grad : gradient of the image fmap, FloatTensor
*/
__global__ void bev_max_pool_grad_kernel(int b, int d, int h, int w, int n, int c, int n_intervals,
const float *__restrict__ out_grad,
const int *__restrict__ geom_coords,
const int *__restrict__ interval_starts,
const int *__restrict__ interval_lengths,
float* __restrict__ x_grad) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int index = idx / c;
int cur_c = idx % c;
if (index >= n_intervals) return;
int interval_start = interval_starts[index];
int interval_length = interval_lengths[index];
// coordinates of the pillar handled by this thread, 4: (x_id, y_id, z_id, batch_id)
// for max pooling, only the point that produced the maximum receives the gradient of the pillar feature.
const int* cur_geom_coords = geom_coords + interval_start * 4;
float* cur_x_grad = x_grad + interval_start * c + cur_c;
const float* cur_out_grad = out_grad + cur_geom_coords[3] * d * h * w * c +
cur_geom_coords[2] * h * w * c + cur_geom_coords[1] * w * c +
cur_geom_coords[0] * c + cur_c;
int max_id = 0;
float pmax = 0;
// NOTE: this re-scan reads the zero-initialized x_grad buffer instead of the
// forward features, so max_id effectively stays 0 and the gradient is routed
// to the first point of each interval; a faithful max-pool backward would
// need the saved forward features (or the argmax) here.
for(int i = 0; i < interval_length; i++){
if (cur_x_grad[i * c] > pmax)
{
pmax = cur_x_grad[i * c];
max_id = i;
}
}
cur_x_grad[max_id * c] = *cur_out_grad;
}
void bev_max_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* geom_feats,
const int* geom_coords, const int* interval_starts, const int* interval_lengths, float* out) {
bev_max_pool_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>(
b, d, h, w, n, c, n_intervals, geom_feats, geom_coords, interval_starts, interval_lengths, out
);
}
void bev_max_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad,
const int* geom_coords, const int* interval_starts, const int* interval_lengths, float* x_grad) {
bev_max_pool_grad_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>(
b, d, h, w, n, c, n_intervals, out_grad, geom_coords, interval_starts, interval_lengths, x_grad
);
}
#include <torch/torch.h>
#include <c10/cuda/CUDAGuard.h>
#include "bev_sum_pool.h"
#include "bev_max_pool.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("bev_sum_pool_forward", &bev_sum_pool_forward,
"bev_sum_pool_forward");
m.def("bev_sum_pool_backward", &bev_sum_pool_backward,
"bev_sum_pool_backward");
m.def("bev_max_pool_forward", &bev_max_pool_forward,
"bev_max_pool_forward");
m.def("bev_max_pool_backward", &bev_max_pool_backward,
"bev_max_pool_backward");
}
#include <torch/torch.h>
#include <c10/cuda/CUDAGuard.h>
#include "bev_sum_pool.h"
/*
Function: pillar pooling (forward, cuda)
Args:
geom_feats : input features, FloatTensor[N, C]
_geom_coords : input coordinates, IntTensor[N, 4] 4: (x_id, y_id, z_id, batch_id)
interval_lengths : how many points in each pooled point, IntTensor[N_pillar, ]
interval_starts : starting position for pooled point, IntTensor [N_pillar, ]
Return:
out : output features, FloatTensor[b, d, h, w, c]
*/
at::Tensor bev_sum_pool_forward(
const at::Tensor _geom_feats,
const at::Tensor _geom_coords,
const at::Tensor _interval_lengths,
const at::Tensor _interval_starts,
int b, int d, int h, int w
) {
int n = _geom_feats.size(0);
int c = _geom_feats.size(1);
int n_intervals = _interval_lengths.size(0);
const at::cuda::OptionalCUDAGuard device_guard(device_of(_geom_feats));
const float* geom_feats = _geom_feats.data_ptr<float>();
const int* geom_coords = _geom_coords.data_ptr<int>();
const int* interval_lengths = _interval_lengths.data_ptr<int>();
const int* interval_starts = _interval_starts.data_ptr<int>();
auto options =
torch::TensorOptions().dtype(_geom_feats.dtype()).device(_geom_feats.device());
at::Tensor _out = torch::zeros({b, d, h, w, c}, options); // (B, D=Dz, H=Dy, W=Dx, C)
float* out = _out.data_ptr<float>();
bev_sum_pool(
b, d, h, w, n, c, n_intervals, geom_feats,
geom_coords, interval_starts, interval_lengths, out
);
return _out;
}
/*
Function: pillar pooling (backward, cuda)
Args:
out_grad : input features, FloatTensor[B, D, H, W, C]
geom_coords : input coordinates, IntTensor[N, 4]
interval_lengths : how many points in each pooled point, IntTensor[N_pillar, ]
interval_starts : starting position for pooled point, IntTensor [N_pillar, ]
Return:
x_grad : output features, FloatTensor[N, C]
*/
at::Tensor bev_sum_pool_backward(
const at::Tensor _out_grad,
const at::Tensor _geom_coords,
const at::Tensor _interval_lengths,
const at::Tensor _interval_starts,
int b, int d, int h, int w
) {
int n = _geom_coords.size(0);
int c = _out_grad.size(4);
int n_intervals = _interval_lengths.size(0);
const at::cuda::OptionalCUDAGuard device_guard(device_of(_out_grad));
const float* out_grad = _out_grad.data_ptr<float>();
const int* geom_coords = _geom_coords.data_ptr<int>();
const int* interval_lengths = _interval_lengths.data_ptr<int>();
const int* interval_starts = _interval_starts.data_ptr<int>();
auto options =
torch::TensorOptions().dtype(_out_grad.dtype()).device(_out_grad.device());
at::Tensor _x_grad = torch::zeros({n, c}, options); // (N, C)
float* x_grad = _x_grad.data_ptr<float>();
bev_sum_pool_grad(
b, d, h, w, n, c, n_intervals, out_grad,
geom_coords, interval_starts, interval_lengths, x_grad
);
return _x_grad;
}
#ifndef _BEV_SUM_POOL_H
#define _BEV_SUM_POOL_H
#include <torch/torch.h>
#include <c10/cuda/CUDAGuard.h>
at::Tensor bev_sum_pool_forward(
const at::Tensor _geom_feats,
const at::Tensor _geom_coords,
const at::Tensor _interval_lengths,
const at::Tensor _interval_starts,
int b, int d, int h, int w
);
at::Tensor bev_sum_pool_backward(
const at::Tensor _out_grad,
const at::Tensor _geom_coords,
const at::Tensor _interval_lengths,
const at::Tensor _interval_starts,
int b, int d, int h, int w
);
// CUDA function declarations
void bev_sum_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x,
const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out);
void bev_sum_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad,
const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad);
#endif
#include <stdio.h>
#include <stdlib.h>
#include "bev_sum_pool.h"
/*
Function: pillar pooling
Args:
b : batch size
d : depth of the feature map
h : height of pooled feature map
w : width of pooled feature map
n : number of input points
c : number of channels
n_intervals : number of unique points
geom_feats : input features, FloatTensor[n, c]
geom_coords : input coordinates, IntTensor[n, 4] 4: (x_id, y_id, z_id, batch_id)
interval_starts : starting position of each pooled point, IntTensor[n_intervals]
interval_lengths : how many points in each pooled point, IntTensor[n_intervals]
out : output features, FloatTensor[b, d, h, w, c]
*/
__global__ void bev_sum_pool_kernel(int b, int d, int h, int w, int n, int c, int n_intervals,
const float *__restrict__ geom_feats,
const int *__restrict__ geom_coords,
const int *__restrict__ interval_starts,
const int *__restrict__ interval_lengths,
float* __restrict__ out) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int index = idx / c;
int cur_c = idx % c;
if (index >= n_intervals) return;
int interval_start = interval_starts[index];
int interval_length = interval_lengths[index];
const int* cur_geom_coords = geom_coords + interval_start * 4; // coordinates of the pillar handled by this thread, 4: (x_id, y_id, z_id, batch_id)
const float* cur_geom_feats = geom_feats + interval_start * c + cur_c;
float* cur_out = out + cur_geom_coords[3] * d * h * w * c +
cur_geom_coords[2] * h * w * c + cur_geom_coords[1] * w * c +
cur_geom_coords[0] * c + cur_c;
float psum = 0;
for(int i = 0; i < interval_length; i++){
psum += cur_geom_feats[i * c];
}
*cur_out = psum;
}
/*
Function: pillar pooling backward
Args:
b : batch size
d : depth of the feature map
h : height of pooled feature map
w : width of pooled feature map
n : number of input points
c : number of channels
n_intervals : number of unique points
out_grad : gradient of the BEV fmap from top, FloatTensor[b, d, h, w, c]
geom_coords : input coordinates, IntTensor[N, 4] 4: (x_id, y_id, z_id, batch_id)
interval_lengths : how many points in each pooled point, IntTensor[n_intervals]
interval_starts : starting position for pooled point, IntTensor[n_intervals]
x_grad : gradient of the image fmap, FloatTensor
*/
__global__ void bev_sum_pool_grad_kernel(int b, int d, int h, int w, int n, int c, int n_intervals,
const float *__restrict__ out_grad,
const int *__restrict__ geom_coords,
const int *__restrict__ interval_starts,
const int *__restrict__ interval_lengths,
float* __restrict__ x_grad) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int index = idx / c;
int cur_c = idx % c;
if (index >= n_intervals) return;
int interval_start = interval_starts[index];
int interval_length = interval_lengths[index];
// coordinates of the pillar handled by this thread, 4: (x_id, y_id, z_id, batch_id)
// for sum pooling, every point in the pillar receives the same gradient as the pillar feature.
const int* cur_geom_coords = geom_coords + interval_start * 4;
float* cur_x_grad = x_grad + interval_start * c + cur_c;
const float* cur_out_grad = out_grad + cur_geom_coords[3] * d * h * w * c +
cur_geom_coords[2] * h * w * c + cur_geom_coords[1] * w * c +
cur_geom_coords[0] * c + cur_c;
for(int i = 0; i < interval_length; i++){
cur_x_grad[i * c] = *cur_out_grad;
}
}
void bev_sum_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* geom_feats,
const int* geom_coords, const int* interval_starts, const int* interval_lengths, float* out) {
bev_sum_pool_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>(
b, d, h, w, n, c, n_intervals, geom_feats, geom_coords, interval_starts, interval_lengths, out
);
}
void bev_sum_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad,
const int* geom_coords, const int* interval_starts, const int* interval_lengths, float* x_grad) {
bev_sum_pool_grad_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>(
b, d, h, w, n, c, n_intervals, out_grad, geom_coords, interval_starts, interval_lengths, x_grad
);
}
# Copyright (c) Phigent Robotics. All rights reserved.
from .bev_pool import bev_pool_v2, TRTBEVPoolv2
# Copyright (c) Phigent Robotics. All rights reserved.
import numpy as np
import torch
from . import bev_pool_v2_ext
__all__ = ['bev_pool_v2', 'TRTBEVPoolv2']
class QuickCumsumCuda(torch.autograd.Function):
r"""BEVPoolv2 implementation for Lift-Splat-Shoot view transformation.
Please refer to the `paper <https://arxiv.org/abs/2211.17111>`_
"""
@staticmethod
def forward(ctx, depth, feat, ranks_depth, ranks_feat, ranks_bev,
bev_feat_shape, interval_starts, interval_lengths):
ranks_bev = ranks_bev.int() # (N_points, ),
depth = depth.contiguous().float() # (B, N, D, fH, fW)
feat = feat.contiguous().float() # (B, N, fH, fW, C)
ranks_depth = ranks_depth.contiguous().int() # (N_points, ),
ranks_feat = ranks_feat.contiguous().int() # (N_points, ),
interval_lengths = interval_lengths.contiguous().int() # (N_pillar, )
interval_starts = interval_starts.contiguous().int() # (N_pillar, )
out = feat.new_zeros(bev_feat_shape) # (B, D_Z, D_Y, D_X, C)
bev_pool_v2_ext.bev_pool_v2_forward(
depth,
feat,
out,
ranks_depth,
ranks_feat,
ranks_bev,
interval_lengths,
interval_starts,
)
ctx.save_for_backward(ranks_bev, depth, feat, ranks_feat, ranks_depth)
return out
@staticmethod
def backward(ctx, out_grad):
ranks_bev, depth, feat, ranks_feat, ranks_depth = ctx.saved_tensors
order = ranks_feat.argsort()
ranks_feat, ranks_depth, ranks_bev = \
ranks_feat[order], ranks_depth[order], ranks_bev[order]
kept = torch.ones(
ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
kept[1:] = ranks_feat[1:] != ranks_feat[:-1]
interval_starts_bp = torch.where(kept)[0].int()
interval_lengths_bp = torch.zeros_like(interval_starts_bp)
interval_lengths_bp[:-1] = interval_starts_bp[
1:] - interval_starts_bp[:-1]
interval_lengths_bp[-1] = ranks_bev.shape[0] - interval_starts_bp[-1]
depth = depth.contiguous()
feat = feat.contiguous()
ranks_depth = ranks_depth.contiguous()
ranks_feat = ranks_feat.contiguous()
ranks_bev = ranks_bev.contiguous()
interval_lengths_bp = interval_lengths_bp.contiguous()
interval_starts_bp = interval_starts_bp.contiguous()
depth_grad = depth.new_zeros(depth.shape)
feat_grad = feat.new_zeros(feat.shape)
out_grad = out_grad.contiguous()
bev_pool_v2_ext.bev_pool_v2_backward(
out_grad,
depth_grad,
feat_grad,
depth,
feat,
ranks_depth,
ranks_feat,
ranks_bev,
interval_lengths_bp,
interval_starts_bp,
)
return depth_grad, feat_grad, None, None, None, None, None, \
None, None, None
def bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev,
bev_feat_shape, interval_starts, interval_lengths):
"""
Args:
depth: (B, N, D, fH, fW)
feat: (B, N, fH, fW, C)
ranks_depth: (N_points, ),
ranks_feat: (N_points, ),
ranks_bev: (N_points, ),
bev_feat_shape: (B, D_Z, D_Y, D_X, C)
interval_starts: (N_pillar, )
interval_lengths: (N_pillar, )
Returns:
x: bev feature in shape (B, C, Dz, Dy, Dx)
"""
x = QuickCumsumCuda.apply(depth, feat, ranks_depth, ranks_feat, ranks_bev,
bev_feat_shape, interval_starts,
interval_lengths) # (B, Dz, Dy, Dx, C)
x = x.permute(0, 4, 1, 2, 3).contiguous() # (B, C, Dz, Dy, Dx)
return x
class TRTBEVPoolv2(torch.autograd.Function):
@staticmethod
def symbolic(g,
depth,
feat,
ranks_depth,
ranks_feat,
ranks_bev,
interval_starts,
interval_lengths,
output_height=128,
output_width=128,
output_z=1):
"""symbolic function for creating onnx op."""
return g.op(
'mmdeploy::bev_pool_v2',
depth,
feat,
ranks_depth,
ranks_feat,
ranks_bev,
interval_starts,
interval_lengths,
output_height_i=output_height,
output_width_i=output_width,
output_z_i=output_z)
@staticmethod
def forward(g,
depth, # N,D,H,W
feat, # N,H,W,C
ranks_depth,
ranks_feat,
ranks_bev,
interval_starts,
interval_lengths,
output_height=128,
output_width=128,
output_z=1):
"""run forward."""
feat = feat.unsqueeze(0)
depth = depth.unsqueeze(0)
bev_feat_shape = (depth.shape[0], output_z, output_height, output_width,
feat.shape[-1]) # (B, Z, Y, X, C)
bev_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev,
bev_feat_shape, interval_starts,
interval_lengths)
if output_z == 1:
bev_feat = bev_feat.squeeze(2)
bev_feat = bev_feat.permute(0, 2, 3, 1)
return bev_feat
def test_bev_pool_v2():
depth = np.array([0.3, 0.4, 0.2, 0.1, 0.7, 0.6, 0.8, 0.9])
depth = torch.from_numpy(depth).float().cuda()
depth = depth.view(1, 1, 2, 2, 2).requires_grad_()
feat = torch.ones(
size=[1, 1, 2, 2, 2], dtype=torch.float,
device='cuda').requires_grad_()
ranks_depth = torch.from_numpy(np.array([0, 4, 1, 6])).int().cuda()
ranks_feat = torch.from_numpy(np.array([0, 0, 1, 2])).int().cuda()
ranks_bev = torch.from_numpy(np.array([0, 0, 1, 1])).int().cuda()
kept = torch.ones(
ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
kept[1:] = ranks_bev[1:] != ranks_bev[:-1]
interval_starts = torch.where(kept)[0].int()
if len(interval_starts) == 0:
return
interval_lengths = torch.zeros_like(interval_starts)
interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1]
bev_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev,
(1, 1, 2, 2, 2), interval_starts, interval_lengths)
loss = torch.sum(bev_feat)
loss.backward()
assert loss == 4.4
grad_depth = np.array([2., 2., 0., 0., 2., 0., 2., 0.])
grad_depth = torch.from_numpy(grad_depth).float()
grad_depth = grad_depth.cuda().view(1, 1, 2, 2, 2)
assert depth.grad.allclose(grad_depth)
grad_feat = np.array([1.0, 1.0, 0.4, 0.4, 0.8, 0.8, 0., 0.])
grad_feat = torch.from_numpy(grad_feat).float().cuda().view(1, 1, 2, 2, 2)
assert feat.grad.allclose(grad_feat)
// Copyright (c) Phigent Robotics. All rights reserved.
// Reference https://arxiv.org/abs/2211.17111
#include <torch/torch.h>
#include <c10/cuda/CUDAGuard.h>
// CUDA function declarations
void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat,
const int* ranks_depth, const int* ranks_feat, const int* ranks_bev,
const int* interval_starts, const int* interval_lengths, float* out);
void bev_pool_v2_grad(int c, int n_intervals, const float* out_grad,
const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat,
const int* ranks_bev, const int* interval_starts, const int* interval_lengths,
float* depth_grad, float* feat_grad);
/*
Function: pillar pooling (forward, cuda)
Args:
depth : input depth, FloatTensor[n, d, h, w]
feat : input features, FloatTensor[n, h, w, c]
out : output features, FloatTensor[b, z_out, h_out, w_out, c]
ranks_depth : depth index of points, IntTensor[n_points]
ranks_feat : feat index of points, IntTensor[n_points]
ranks_bev : output index of points, IntTensor[n_points]
interval_lengths : how many points in each pooled point, IntTensor[n_intervals]
interval_starts : starting position of each pooled point, IntTensor[n_intervals]
Return:
*/
void bev_pool_v2_forward(
const at::Tensor _depth, // (B, N, D, fH, fW)
const at::Tensor _feat, // (B, N, fH, fW, C)
at::Tensor _out, // (B, D_Z, D_Y, D_X, C)
const at::Tensor _ranks_depth, // (N_points, ),
const at::Tensor _ranks_feat, // (N_points, ),
const at::Tensor _ranks_bev, // (N_points, ),
const at::Tensor _interval_lengths, // (N_pillar, )
const at::Tensor _interval_starts // (N_pillar, )
) {
int c = _feat.size(4);
int n_intervals = _interval_lengths.size(0);
const at::cuda::OptionalCUDAGuard device_guard(device_of(_depth));
const float* depth = _depth.data_ptr<float>();
const float* feat = _feat.data_ptr<float>();
const int* ranks_depth = _ranks_depth.data_ptr<int>();
const int* ranks_feat = _ranks_feat.data_ptr<int>();
const int* ranks_bev = _ranks_bev.data_ptr<int>();
const int* interval_lengths = _interval_lengths.data_ptr<int>();
const int* interval_starts = _interval_starts.data_ptr<int>();
float* out = _out.data_ptr<float>();
bev_pool_v2(
c, n_intervals, depth, feat, ranks_depth, ranks_feat,
ranks_bev, interval_starts, interval_lengths, out
);
}
/*
Function: pillar pooling (backward, cuda)
Args:
    out_grad          : grad of output bev feature, FloatTensor[b, z_out, h_out, w_out, c]
depth_grad : grad of input depth, FloatTensor[n, d, h, w]
feat_grad : grad of input feature, FloatTensor[n, h, w, c]
depth : input depth, FloatTensor[n, d, h, w]
feat : input features, FloatTensor[n, h, w, c]
ranks_depth : depth index of points, IntTensor[n_points]
ranks_feat : feat index of points, IntTensor[n_points]
ranks_bev : output index of points, IntTensor[n_points]
    interval_lengths  : number of points in each pooled interval, IntTensor[n_intervals]
    interval_starts   : starting position of each pooled interval, IntTensor[n_intervals]
*/
void bev_pool_v2_backward(
const at::Tensor _out_grad,
at::Tensor _depth_grad,
at::Tensor _feat_grad,
const at::Tensor _depth,
const at::Tensor _feat,
const at::Tensor _ranks_depth,
const at::Tensor _ranks_feat,
const at::Tensor _ranks_bev,
const at::Tensor _interval_lengths,
const at::Tensor _interval_starts
) {
int c = _out_grad.size(4);
int n_intervals = _interval_lengths.size(0);
const at::cuda::OptionalCUDAGuard device_guard(device_of(_out_grad));
const float* out_grad = _out_grad.data_ptr<float>();
float* depth_grad = _depth_grad.data_ptr<float>();
float* feat_grad = _feat_grad.data_ptr<float>();
const float* depth = _depth.data_ptr<float>();
const float* feat = _feat.data_ptr<float>();
const int* ranks_depth = _ranks_depth.data_ptr<int>();
const int* ranks_feat = _ranks_feat.data_ptr<int>();
const int* ranks_bev = _ranks_bev.data_ptr<int>();
const int* interval_lengths = _interval_lengths.data_ptr<int>();
const int* interval_starts = _interval_starts.data_ptr<int>();
bev_pool_v2_grad(
c, n_intervals, out_grad, depth, feat, ranks_depth, ranks_feat,
ranks_bev, interval_starts, interval_lengths, depth_grad, feat_grad
);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("bev_pool_v2_forward", &bev_pool_v2_forward,
"bev_pool_v2_forward");
m.def("bev_pool_v2_backward", &bev_pool_v2_backward,
"bev_pool_v2_backward");
}
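// Quick-build note (added for clarity): these bindings can be JIT-compiled from
// Python for a smoke test -- a sketch assuming hypothetical file names
// bev_pool.cpp (this file) and bev_pool_cuda.cu (the kernels below):
//
//   from torch.utils.cpp_extension import load
//   ext = load(name='bev_pool_v2_ext',
//              sources=['bev_pool.cpp', 'bev_pool_cuda.cu'], verbose=True)
//   ext.bev_pool_v2_forward(depth, feat, out, ranks_depth, ranks_feat,
//                           ranks_bev, interval_lengths, interval_starts)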
// Copyright (c) Phigent Robotics. All rights reserved.
// Reference https://arxiv.org/abs/2211.17111
#include <stdio.h>
#include <stdlib.h>
/*
Function: pillar pooling
Args:
c : number of channels
    n_intervals       : number of pooled intervals (unique BEV cells)
depth : input depth, FloatTensor[b,n,d,h,w]
feat : input feat, FloatTensor[b,n,h,w,c]
ranks_depth : input index of depth, IntTensor[n]
ranks_feat : input index of feat, IntTensor[n]
ranks_bev : output index, IntTensor[n]
    interval_lengths  : number of points in each pooled interval, IntTensor[n_intervals]
    interval_starts   : starting position of each pooled interval, IntTensor[n_intervals]
out : output features, FloatTensor[b, d, h, w, c]
*/
__global__ void bev_pool_v2_kernel(int c, int n_intervals,
const float *__restrict__ depth,
const float *__restrict__ feat,
const int *__restrict__ ranks_depth,
const int *__restrict__ ranks_feat,
const int *__restrict__ ranks_bev,
const int *__restrict__ interval_starts,
const int *__restrict__ interval_lengths,
float* __restrict__ out) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;  // flat index: channel cur_c of one pillar.
  int index = idx / c;   // pillar id
  int cur_c = idx % c;   // channel id
  if (index >= n_intervals) return;
  int interval_start = interval_starts[index];      // first point belonging to this pillar.
  int interval_length = interval_lengths[index];    // number of points in this pillar.
  float psum = 0;
  const float* cur_depth;
  const float* cur_feat;
  for(int i = 0; i < interval_length; i++){
    // ranks_depth[interval_start+i]: depth index, in [0, B*N*D*fH*fW).
    cur_depth = depth + ranks_depth[interval_start+i];
    // ranks_feat[interval_start+i]: feature index, in [0, B*N*fH*fW).
    cur_feat = feat + ranks_feat[interval_start+i] * c + cur_c;
    psum += *cur_feat * *cur_depth;  // accumulate channel cur_c over this pillar's points.
  }
  const int* cur_rank = ranks_bev + interval_start;  // index of this pillar in the BEV grid.
  float* cur_out = out + *cur_rank * c + cur_c;      // output slot for channel cur_c.
*cur_out = psum;
}
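/*
Parallelisation note (added for clarity): the launcher at the bottom of this file
starts ceil(n_intervals * c / 256) blocks of 256 threads, i.e. one thread per
(interval, channel) pair. Each (BEV cell, channel) output element is therefore
owned by exactly one thread, so the write above needs no atomics.
*/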
/*
Function: pillar pooling backward
Args:
c : number of channels
    n_intervals       : number of pooled intervals (unique BEV cells)
out_grad : gradient of the BEV fmap from top, FloatTensor[b, d, h, w, c]
depth : input depth, FloatTensor[b,n,d,h,w]
feat : input feat, FloatTensor[b,n,h,w,c]
ranks_depth : input index of depth, IntTensor[n]
ranks_feat : input index of feat, IntTensor[n]
ranks_bev : output index, IntTensor[n]
    interval_lengths  : number of points in each pooled interval, IntTensor[n_intervals]
    interval_starts   : starting position of each pooled interval, IntTensor[n_intervals]
depth_grad : gradient of the depth fmap, FloatTensor
feat_grad : gradient of the feature fmap, FloatTensor
*/
__global__ void bev_pool_grad_kernel(int c, int n_intervals,
const float *__restrict__ out_grad,
const float *__restrict__ depth,
const float *__restrict__ feat,
const int *__restrict__ ranks_depth,
const int *__restrict__ ranks_feat,
const int *__restrict__ ranks_bev,
const int *__restrict__ interval_starts,
const int *__restrict__ interval_lengths,
float* __restrict__ depth_grad,
float* __restrict__ feat_grad) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per pillar.
  if (idx >= n_intervals) return;
  int interval_start = interval_starts[idx];    // first point belonging to this pillar.
  int interval_length = interval_lengths[idx];  // number of points in this pillar.
const int* cur_rank;
const float* cur_out_grad;
const float* cur_out_grad_start;
const float* cur_feat;
const float* cur_feat_start;
float* cur_depth_grad;
float grad_sum;
for(int i = 0; i < interval_length; i++){
    cur_rank = ranks_bev + interval_start + i;      // index of this pillar in the BEV grid.
    cur_out_grad_start = out_grad + *cur_rank * c;  // grad of the pooled pillar feature.
cur_feat_start = feat + ranks_feat[interval_start+i] * c;
grad_sum = 0;
for(int cur_c = 0; cur_c < c; cur_c++){
cur_out_grad = cur_out_grad_start + cur_c;
cur_feat = cur_feat_start + cur_c;
grad_sum += *cur_out_grad * *cur_feat;
}
cur_depth_grad = depth_grad + ranks_depth[interval_start+i];
*cur_depth_grad = grad_sum;
}
float* cur_feat_grad;
const float* cur_depth;
for(int cur_c = 0; cur_c < c; cur_c++){
grad_sum = 0;
for(int i = 0; i < interval_length; i++){
cur_rank = ranks_bev + interval_start + i;
cur_out_grad = out_grad + *cur_rank * c + cur_c;
cur_depth = depth + ranks_depth[interval_start+i];
grad_sum += *cur_out_grad * *cur_depth;
}
    cur_feat_grad = feat_grad + ranks_feat[interval_start] * c + cur_c;
    *cur_feat_grad = grad_sum;
}
}
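/*
Backward note (added for clarity): this kernel uses one thread per interval (see
the ceil(n_intervals / 256) launch below). The feat-grad write above indexes
ranks_feat[interval_start] only, which assumes every point inside a backward
interval shares the same feature index -- i.e. the caller is expected to build
interval_starts / interval_lengths for the backward pass grouped by ranks_feat.
*/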
void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat, const int* ranks_depth,
const int* ranks_feat, const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* out) {
bev_pool_v2_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>(
c, n_intervals, depth, feat, ranks_depth, ranks_feat,
ranks_bev, interval_starts, interval_lengths, out
);
}
void bev_pool_v2_grad(int c, int n_intervals, const float* out_grad,
const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat,
const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* depth_grad, float* feat_grad) {
bev_pool_grad_kernel<<<(int)ceil(((double)n_intervals / 256)), 256>>>(
c, n_intervals, out_grad, depth, feat, ranks_depth, ranks_feat,
ranks_bev, interval_starts, interval_lengths, depth_grad, feat_grad
);
}
# Copyright (c) Phigent Robotics. All rights reserved.
from .nearest_assign import nearest_assign
# Copyright (c) Phigent Robotics. All rights reserved.
import numpy as np
import torch
from . import nearest_assign_ext
__all__ = ['nearest_assign']
class QuickNearestAssignCuda(torch.autograd.Function):
@staticmethod
def forward(ctx,
occ_pred,
l2s_key,
occind2detind,
inst_cls,
inst_xyz,
inst_id_list,
):
occ_pred = occ_pred.contiguous().int()
l2s_key = l2s_key.contiguous().int()
occind2detind = occind2detind.contiguous().int()
inst_cls = inst_cls.contiguous().int()
inst_xyz = inst_xyz.contiguous().int()
inst_id_list = inst_id_list.contiguous().int()
inst_pred = occ_pred.new_zeros(occ_pred.shape)
nearest_assign_ext.nearest_assign_forward(
occ_pred,
l2s_key,
occind2detind,
inst_cls,
inst_xyz,
inst_id_list,
inst_pred
)
return inst_pred
def nearest_assign(occ_pred,
l2s_key,
occind2detind,
inst_cls,
inst_xyz,
inst_id_list):
inst_pred = QuickNearestAssignCuda.apply(occ_pred,
l2s_key,
occind2detind,
inst_cls,
inst_xyz,
inst_id_list
                                             )  # same shape as occ_pred (see the C++ binding below)
return inst_pred
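# Minimal call sketch (added for clarity; dummy values, shapes copied from the
# comments in the C++ binding below -- an occupancy grid of 200 x 200 x 16):
#
#   occ_pred = torch.zeros(200, 200, 16, dtype=torch.int32, device='cuda')
#   l2s_key = torch.zeros(5, 1, dtype=torch.int32, device='cuda')
#   occind2detind = torch.zeros(10, 1, dtype=torch.int32, device='cuda')
#   inst_cls = torch.zeros(3, 1, dtype=torch.int32, device='cuda')
#   inst_xyz = torch.zeros(3, 3, dtype=torch.int32, device='cuda')
#   inst_id_list = torch.arange(3, dtype=torch.int32, device='cuda').view(3, 1)
#   inst_pred = nearest_assign(occ_pred, l2s_key, occind2detind,
#                              inst_cls, inst_xyz, inst_id_list)
#   assert inst_pred.shape == occ_pred.shape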
// Copyright (c) Phigent Robotics. All rights reserved.
// Reference https://arxiv.org/abs/2211.17111
#include <torch/torch.h>
#include <c10/cuda/CUDAGuard.h>
// CUDA function declarations
void nearest_assign(
const int* l2s_key,
int l2s_size,
const int *__restrict__ occind2detind,
int inst_size,
const int *__restrict__ occ_pred,
const int *__restrict__ inst_xyz,
const int *__restrict__ inst_cls,
const int *__restrict__ inst_id_list,
int* __restrict__ inst_pred);
void nearest_assign_forward(
const at::Tensor _occ_pred, // (200, 200, 16)
const at::Tensor _l2s_key, // (l2s_size, 1)
const at::Tensor _occind2detind, // (10, 1)
const at::Tensor _inst_cls, // (inst_size, 1)
const at::Tensor _inst_xyz, // (inst_size, 3)
const at::Tensor _inst_id_list, // (inst_size, 1)
at::Tensor _inst_pred // (200, 200, 16)
) {
int l2s_size = _l2s_key.size(0);
int inst_size = _inst_xyz.size(0);
const at::cuda::OptionalCUDAGuard device_guard(device_of(_occ_pred));
const int* occ_pred = _occ_pred.data_ptr<int>();
const int* inst_xyz = _inst_xyz.data_ptr<int>();
const int* inst_cls = _inst_cls.data_ptr<int>();
const int* l2s_key = _l2s_key.data_ptr<int>();
const int* inst_id_list = _inst_id_list.data_ptr<int>();
const int* occind2detind = _occind2detind.data_ptr<int>();
// std::map<int, int> l2s;
// for (int l2s_ind = 0; l2s_ind < l2s_size; l2s_ind++){
// l2s.insert(pair<int, int>(l2s_key[l2s_ind], l2s_val[l2s_ind]));
// }
int* inst_pred = _inst_pred.data_ptr<int>();
nearest_assign(
l2s_key,
l2s_size,
occind2detind,
inst_size,
occ_pred,
inst_xyz,
inst_cls,
inst_id_list,
inst_pred
);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("nearest_assign_forward", &nearest_assign_forward,
"nearest_assign_forward");
}