Commit 0a8030b0 authored by yukang.chen

first commit

parent 519b1564
File added
@@ -5,7 +5,7 @@ from ...utils import common_utils
from ...utils import box_utils
def random_flip_along_x(gt_boxes, points, return_flip=False):
"""
Args:
gt_boxes: (N, 7 + C), [x, y, z, dx, dy, dz, heading, [vx], [vy]]
@@ -20,11 +20,12 @@ def random_flip_along_x(gt_boxes, points):
if gt_boxes.shape[1] > 7:
gt_boxes[:, 8] = -gt_boxes[:, 8]
if return_flip:
return gt_boxes, points, enable
return gt_boxes, points
def random_flip_along_y(gt_boxes, points, return_flip=False):
"""
Args:
gt_boxes: (N, 7 + C), [x, y, z, dx, dy, dz, heading, [vx], [vy]]
@@ -39,11 +40,12 @@ def random_flip_along_y(gt_boxes, points):
if gt_boxes.shape[1] > 7:
gt_boxes[:, 7] = -gt_boxes[:, 7]
if return_flip:
return gt_boxes, points, enable
return gt_boxes, points
def global_rotation(gt_boxes, points, rot_range, return_rot=False):
"""
Args:
gt_boxes: (N, 7 + C), [x, y, z, dx, dy, dz, heading, [vx], [vy]]
@@ -61,10 +63,12 @@ def global_rotation(gt_boxes, points, rot_range):
np.array([noise_rotation])
)[0][:, 0:2]
if return_rot:
return gt_boxes, points, noise_rotation
return gt_boxes, points
def global_scaling(gt_boxes, points, scale_range, return_scale=False):
"""
Args:
gt_boxes: (N, 7), [x, y, z, dx, dy, dz, heading]
@@ -77,7 +81,8 @@ def global_scaling(gt_boxes, points, scale_range):
noise_scale = np.random.uniform(scale_range[0], scale_range[1])
points[:, :3] *= noise_scale
gt_boxes[:, :6] *= noise_scale
if return_scale:
return gt_boxes, points, noise_scale
return gt_boxes, points
...
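The only behavioral change in this hunk is the optional `return_flip` / `return_rot` / `return_scale` flags, which expose the randomly sampled augmentation parameters to the caller. A minimal usage sketch (dummy inputs; assumes the standard `pcdet` package layout):

```python
import numpy as np
from pcdet.datasets.augmentor import augmentor_utils

# dummy boxes [x, y, z, dx, dy, dz, heading] and points [x, y, z, intensity]
gt_boxes = np.zeros((2, 7), dtype=np.float32)
points = np.random.randn(100, 4).astype(np.float32)

gt_boxes, points, enable = augmentor_utils.random_flip_along_x(gt_boxes, points, return_flip=True)
gt_boxes, points, noise_rot = augmentor_utils.global_rotation(gt_boxes, points, rot_range=[-0.785, 0.785], return_rot=True)
gt_boxes, points, noise_scale = augmentor_utils.global_scaling(gt_boxes, points, [0.95, 1.05], return_scale=True)
print(enable, noise_rot, noise_scale)  # flip flag, rotation angle (rad), scale factor
```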
@@ -46,9 +46,10 @@ class DataAugmentor(object):
gt_boxes, points = data_dict['gt_boxes'], data_dict['points']
for cur_axis in config['ALONG_AXIS_LIST']:
assert cur_axis in ['x', 'y']
gt_boxes, points, enable = getattr(augmentor_utils, 'random_flip_along_%s' % cur_axis)(
gt_boxes, points, return_flip=True
)
data_dict['flip_%s'%cur_axis] = enable
data_dict['gt_boxes'] = gt_boxes
data_dict['points'] = points
@@ -60,23 +61,25 @@ class DataAugmentor(object):
rot_range = config['WORLD_ROT_ANGLE']
if not isinstance(rot_range, list):
rot_range = [-rot_range, rot_range]
gt_boxes, points, noise_rot = augmentor_utils.global_rotation(
data_dict['gt_boxes'], data_dict['points'], rot_range=rot_range, return_rot=True
)
data_dict['gt_boxes'] = gt_boxes
data_dict['points'] = points
data_dict['noise_rot'] = noise_rot
return data_dict
def random_world_scaling(self, data_dict=None, config=None):
if data_dict is None:
return partial(self.random_world_scaling, config=config)
gt_boxes, points, noise_scale = augmentor_utils.global_scaling(
data_dict['gt_boxes'], data_dict['points'], config['WORLD_SCALE_RANGE'], return_scale=True
)
data_dict['gt_boxes'] = gt_boxes
data_dict['points'] = points
data_dict['noise_scale'] = noise_scale
return data_dict
def random_image_flip(self, data_dict=None, config=None):
...
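The keys recorded here (`flip_x`, `flip_y`, `noise_rot`, `noise_scale`) are consumed later by `FocalSparseConv.construct_multimodal_features` (further down in this commit) to map augmented LiDAR coordinates back to the un-augmented frame before projecting them into the image. A rough single-sample sketch of that reversal (hypothetical helper, assuming xyz point order):

```python
import numpy as np

def undo_global_augmentation(points_xyz, data_dict):
    # Undo scaling, then rotation, then flips: the reverse of the
    # order in which DataAugmentor applied them.
    if 'noise_scale' in data_dict:
        points_xyz = points_xyz / data_dict['noise_scale']
    if 'noise_rot' in data_dict:
        c, s = np.cos(-data_dict['noise_rot']), np.sin(-data_dict['noise_rot'])
        # same row-vector convention as common_utils.rotate_points_along_z
        points_xyz = points_xyz @ np.array([[c, s, 0.], [-s, c, 0.], [0., 0., 1.]])
    if data_dict.get('flip_x', False):
        points_xyz[:, 1] = -points_xyz[:, 1]  # flip along x mirrors y
    if data_dict.get('flip_y', False):
        points_xyz[:, 0] = -points_xyz[:, 0]  # flip along y mirrors x
    return points_xyz
```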
@@ -3,18 +3,27 @@ import pickle
import os
import copy
import numpy as np
from skimage import io
import torch
import SharedArray
import torch.distributed as dist
from ...ops.iou3d_nms import iou3d_nms_utils
from ...utils import box_utils, common_utils, box2d_utils, calibration_kitti
from pcdet.datasets.kitti.kitti_object_eval_python import kitti_common
class DataBaseSampler(object):
def __init__(self, root_path, sampler_cfg, class_names, logger=None):
self.root_path = root_path
self.class_names = class_names
self.sampler_cfg = sampler_cfg
self.aug_with_img = sampler_cfg.get('AUG_WITH_IMAGE', False)
self.joint_sample = sampler_cfg.get('JOINT_SAMPLE', False)
self.keep_raw = sampler_cfg.get('KEEP_RAW', False)
self.box_iou_thres = sampler_cfg.get('BOX_IOU_THRES', 1.0)
self.aug_use_type = sampler_cfg.get('AUG_USE_TYPE', 'annotation')
self.point_refine = sampler_cfg.get('POINT_REFINE', False)
self.logger = logger
self.db_infos = {}
for class_name in class_names:
@@ -153,19 +162,89 @@ class DataBaseSampler(object):
gt_boxes[:, 2] -= mv_height # lidar view
return gt_boxes, mv_height
def copy_paste_to_image(self, data_dict, crop_feat, gt_number, point_idxes=None):
image = data_dict['images']
boxes3d = data_dict['gt_boxes']
boxes2d = data_dict['gt_boxes2d']
corners_lidar = box_utils.boxes_to_corners_3d(boxes3d)
img_aug_type = self.sampler_cfg.IMG_AUG_TYPE
if 'depth' in img_aug_type:
paste_order = boxes3d[:,0].argsort()
paste_order = paste_order[::-1]
else:
paste_order = np.arange(len(boxes3d), dtype=int)
if 'reverse' in img_aug_type:
paste_order = paste_order[::-1]
paste_mask = -255 * np.ones(image.shape[:2], dtype=int)
fg_mask = np.zeros(image.shape[:2], dtype=int)
overlap_mask = np.zeros(image.shape[:2], dtype=int)
depth_mask = np.zeros((*image.shape[:2], 2), dtype=float)
points_2d, depth_2d = data_dict['calib'].lidar_to_img(data_dict['points'][:,:3])
points_2d[:,0] = np.clip(points_2d[:,0], a_min=0, a_max=image.shape[1]-1)
points_2d[:,1] = np.clip(points_2d[:,1], a_min=0, a_max=image.shape[0]-1)
points_2d = points_2d.astype(int)
for _order in paste_order:
_box2d = boxes2d[_order]
image[_box2d[1]:_box2d[3],_box2d[0]:_box2d[2]] = crop_feat[_order]
overlap_mask[_box2d[1]:_box2d[3],_box2d[0]:_box2d[2]] += \
(paste_mask[_box2d[1]:_box2d[3],_box2d[0]:_box2d[2]] > 0).astype(int)
paste_mask[_box2d[1]:_box2d[3],_box2d[0]:_box2d[2]] = _order
if 'cover' in self.aug_use_type:
# HxWx2 for min and max depth of each box region
depth_mask[_box2d[1]:_box2d[3],_box2d[0]:_box2d[2],0] = corners_lidar[_order,:,0].min()
depth_mask[_box2d[1]:_box2d[3],_box2d[0]:_box2d[2],1] = corners_lidar[_order,:,0].max()
# foreground area of original point cloud in image plane
if _order < gt_number:
fg_mask[_box2d[1]:_box2d[3],_box2d[0]:_box2d[2]] = 1
data_dict['images'] = image
if not self.joint_sample:
return data_dict
new_mask = paste_mask[points_2d[:,1], points_2d[:,0]]==(point_idxes+gt_number)
if self.keep_raw:
raw_mask = point_idxes==-1
else:
raw_fg = (fg_mask == 1) & (paste_mask >= 0) & (paste_mask < gt_number)
raw_bg = (fg_mask == 0) & (paste_mask < 0)
raw_mask = raw_fg[points_2d[:,1], points_2d[:,0]] | raw_bg[points_2d[:,1], points_2d[:,0]]
keep_mask = new_mask | raw_mask
data_dict['points_2d'] = points_2d
if 'annotation' in self.aug_use_type:
data_dict['points'] = data_dict['points'][keep_mask]
data_dict['points_2d'] = data_dict['points_2d'][keep_mask]
elif 'projection' in self.aug_use_type:
overlap_mask[overlap_mask>=1] = 1
data_dict['overlap_mask'] = overlap_mask
if 'cover' in self.aug_use_type:
data_dict['depth_mask'] = depth_mask
return data_dict
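The pasting order above is what makes the 2D copy-paste occlusion-aware: with `'depth'` in `IMG_AUG_TYPE`, boxes are sorted by their LiDAR x coordinate (forward distance) and pasted far-to-near, so nearer crops overwrite farther ones. A tiny sketch of that ordering rule:

```python
import numpy as np

boxes3d_x = np.array([20.0, 5.0, 12.0])   # forward distance of three boxes
paste_order = boxes3d_x.argsort()[::-1]   # farthest first, as in the 'depth' branch
print(paste_order)                        # -> [0 2 1]: paste the 20 m box first, the 5 m box last
```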
def add_sampled_boxes_to_scene(self, data_dict, sampled_gt_boxes, mv_height, sampled_gt_boxes2d, total_valid_sampled_dict):
gt_boxes_mask = data_dict['gt_boxes_mask']
gt_boxes = data_dict['gt_boxes'][gt_boxes_mask]
gt_names = data_dict['gt_names'][gt_boxes_mask]
gt_number = gt_boxes_mask.sum().astype(int)
points = data_dict['points']
if self.sampler_cfg.get('USE_ROAD_PLANE', False) and not self.aug_with_img:
sampled_gt_boxes, mv_height = self.put_boxes_on_road_planes(
sampled_gt_boxes, data_dict['road_plane'], data_dict['calib']
)
data_dict.pop('calib')
data_dict.pop('road_plane')
obj_points_list, obj_index_list, crop_boxes2d = [], [], []
# convert sampled 3D boxes to image plane
if self.aug_with_img:
gt_boxes2d = data_dict['gt_boxes2d'][gt_boxes_mask].astype(int)
gt_crops2d = [data_dict['images'][_x[1]:_x[3],_x[0]:_x[2]] for _x in gt_boxes2d]
if self.use_shared_memory:
gt_database_data = SharedArray.attach(f"shm://{self.gt_database_data_key}")
gt_database_data.setflags(write=0)
@@ -187,21 +266,78 @@ class DataBaseSampler(object):
# mv height
obj_points[:, 2] -= mv_height[idx]
if self.aug_with_img:
calib_file = kitti_common.get_calib_path(int(info['image_idx']), self.root_path, relative_path=False)
sampled_calib = calibration_kitti.Calibration(calib_file)
points_2d, depth_2d = sampled_calib.lidar_to_img(obj_points[:,:3])
if self.point_refine:
# align calibration matrices for points
points_rect = data_dict['calib'].img_to_rect(points_2d[:,0], points_2d[:,1], depth_2d)
points_lidar = data_dict['calib'].rect_to_lidar(points_rect)
obj_points[:, :3] = points_lidar
# align calibration matrices for boxes
box3d_raw = sampled_gt_boxes[idx].reshape(1,-1)
box3d_coords = box_utils.boxes_to_corners_3d(box3d_raw)[0]
box3d_box, box3d_depth = sampled_calib.lidar_to_img(box3d_coords)
box3d_coord_rect = data_dict['calib'].img_to_rect(box3d_box[:,0], box3d_box[:,1], box3d_depth)
box3d_rect = box_utils.corners_rect_to_camera(box3d_coord_rect).reshape(1,-1)
box3d_lidar = box_utils.boxes3d_kitti_camera_to_lidar(box3d_rect, data_dict['calib'])
box2d = box_utils.boxes3d_kitti_camera_to_imageboxes(box3d_rect, data_dict['calib'],
data_dict['images'].shape[:2])
sampled_gt_boxes[idx] = box3d_lidar[0]
sampled_gt_boxes2d[idx] = box2d[0]
obj_idx = idx * np.ones(len(obj_points), dtype=int)
obj_points_list.append(obj_points) obj_points_list.append(obj_points)
obj_index_list.append(obj_idx)
# copy crops from images
if self.aug_with_img:
img_path = self.root_path / self.sampler_cfg.IMG_ROOT_PATH / (info['image_idx']+'.png')
raw_image = io.imread(img_path)
raw_image = raw_image.astype(np.float32)
raw_center = info['bbox'].reshape(2,2).mean(0)
new_box = sampled_gt_boxes2d[idx].astype(int)
new_shape = np.array([new_box[2]-new_box[0], new_box[3]-new_box[1]])
raw_box = np.concatenate([raw_center-new_shape/2, raw_center+new_shape/2]).astype(int)
raw_box[0::2] = np.clip(raw_box[0::2], a_min=0, a_max=raw_image.shape[1])
raw_box[1::2] = np.clip(raw_box[1::2], a_min=0, a_max=raw_image.shape[0])
if (raw_box[2]-raw_box[0])!=new_shape[0] or (raw_box[3]-raw_box[1])!=new_shape[1]:
new_center = new_box.reshape(2,2).mean(0)
new_shape = np.array([raw_box[2]-raw_box[0], raw_box[3]-raw_box[1]])
new_box = np.concatenate([new_center-new_shape/2, new_center+new_shape/2]).astype(int)
img_crop2d = raw_image[raw_box[1]:raw_box[3],raw_box[0]:raw_box[2]] / 255
crop_boxes2d.append(new_box)
gt_crops2d.append(img_crop2d)
obj_points = np.concatenate(obj_points_list, axis=0)
obj_points_idx = np.concatenate(obj_index_list, axis=0)
sampled_gt_names = np.array([x['name'] for x in total_valid_sampled_dict])
large_sampled_gt_boxes = box_utils.enlarge_box3d(
sampled_gt_boxes[:, 0:7], extra_width=self.sampler_cfg.REMOVE_EXTRA_WIDTH
)
points = box_utils.remove_points_in_boxes3d(points, large_sampled_gt_boxes)
point_idxes = -1 * np.ones(len(points), dtype=int)
points = np.concatenate([points, obj_points], axis=0)
point_idxes = np.concatenate([point_idxes, obj_points_idx], axis=0)
gt_names = np.concatenate([gt_names, sampled_gt_names], axis=0)
gt_boxes = np.concatenate([gt_boxes, sampled_gt_boxes], axis=0)
data_dict['gt_boxes'] = gt_boxes
data_dict['gt_names'] = gt_names
data_dict['points'] = points
if self.aug_with_img:
data_dict['gt_boxes2d'] = np.concatenate([gt_boxes2d, np.array(crop_boxes2d)], axis=0)
data_dict = self.copy_paste_to_image(data_dict, gt_crops2d, gt_number, point_idxes)
if self.sampler_cfg.get('USE_ROAD_PLANE', False) and self.aug_with_img:
# data_dict.pop('calib')
data_dict.pop('road_plane')
return data_dict
def __call__(self, data_dict):
@@ -217,6 +353,8 @@ class DataBaseSampler(object):
gt_names = data_dict['gt_names'].astype(str)
existed_boxes = gt_boxes
total_valid_sampled_dict = []
sampled_mv_height = []
sampled_gt_boxes2d = []
for class_name, sample_group in self.sample_groups.items():
if self.limit_whole_scene:
num_gt = np.sum(class_name == gt_names)
@@ -234,15 +372,48 @@ class DataBaseSampler(object):
iou2[range(sampled_boxes.shape[0]), range(sampled_boxes.shape[0])] = 0
iou1 = iou1 if iou1.shape[1] > 0 else iou2
valid_mask = ((iou1.max(axis=1) + iou2.max(axis=1)) == 0).nonzero()[0]
# filter out box2d iou > thres
if self.sampler_cfg.get('USE_ROAD_PLANE', False):
sampled_boxes, mv_height = self.put_boxes_on_road_planes(
sampled_boxes, data_dict['road_plane'], data_dict['calib']
)
if self.aug_with_img:
# sampled_boxes2d = np.stack([x['bbox'] for x in sampled_dict], axis=0).astype(np.float32)
boxes3d_camera = box_utils.boxes3d_lidar_to_kitti_camera(sampled_boxes, data_dict['calib'])
sampled_boxes2d = box_utils.boxes3d_kitti_camera_to_imageboxes(boxes3d_camera, data_dict['calib'],
data_dict['images'].shape[:2])
sampled_boxes2d = torch.Tensor(sampled_boxes2d)
existed_boxes2d = torch.Tensor(data_dict['gt_boxes2d'])
iou2d1 = box2d_utils.pairwise_iou(sampled_boxes2d, existed_boxes2d).cpu().numpy()
iou2d2 = box2d_utils.pairwise_iou(sampled_boxes2d, sampled_boxes2d).cpu().numpy()
iou2d2[range(sampled_boxes2d.shape[0]), range(sampled_boxes2d.shape[0])] = 0
iou2d1 = iou2d1 if iou2d1.shape[1] > 0 else iou2d2
valid_mask = ((iou2d1.max(axis=1)<self.box_iou_thres) &
(iou2d2.max(axis=1)<self.box_iou_thres) &
((iou1.max(axis=1) + iou2.max(axis=1)) == 0)).nonzero()[0]
sampled_boxes2d = sampled_boxes2d[valid_mask].cpu().numpy()
sampled_gt_boxes2d.append(sampled_boxes2d)
valid_sampled_dict = [sampled_dict[x] for x in valid_mask]
valid_sampled_boxes = sampled_boxes[valid_mask]
mv_height = mv_height[valid_mask]
existed_boxes = np.concatenate((existed_boxes, valid_sampled_boxes), axis=0)
sampled_mv_height = np.concatenate((sampled_mv_height, mv_height), axis=0)
total_valid_sampled_dict.extend(valid_sampled_dict)
sampled_gt_boxes = existed_boxes[gt_boxes.shape[0]:, :]
if len(sampled_gt_boxes2d) > 0:
sampled_gt_boxes2d = np.concatenate(sampled_gt_boxes2d, axis=0)
if len(total_valid_sampled_dict) > 0:
data_dict = self.add_sampled_boxes_to_scene(data_dict,
sampled_gt_boxes,
sampled_mv_height,
sampled_gt_boxes2d,
total_valid_sampled_dict)
data_dict.pop('gt_boxes_mask')
return data_dict
@@ -9,7 +9,6 @@ from .augmentor.data_augmentor import DataAugmentor
from .processor.data_processor import DataProcessor
from .processor.point_feature_encoder import PointFeatureEncoder
class DatasetTemplate(torch_data.Dataset):
def __init__(self, dataset_cfg=None, class_names=None, training=True, root_path=None, logger=None):
super().__init__()
@@ -124,13 +123,14 @@ class DatasetTemplate(torch_data.Dataset):
assert 'gt_boxes' in data_dict, 'gt_boxes should be provided for training'
gt_boxes_mask = np.array([n in self.class_names for n in data_dict['gt_names']], dtype=np.bool_)
calib = data_dict['calib']
data_dict = self.data_augmentor.forward(
data_dict={
**data_dict,
'gt_boxes_mask': gt_boxes_mask
}
)
data_dict['calib'] = calib
if data_dict.get('gt_boxes', None) is not None:
selected = common_utils.keep_arrays_by_name(data_dict['gt_names'], self.class_names)
data_dict['gt_boxes'] = data_dict['gt_boxes'][selected]
@@ -205,7 +205,7 @@ class DatasetTemplate(torch_data.Dataset):
pad_w = common_utils.get_pad_params(desired_size=max_w, cur_size=image.shape[1])
pad_width = (pad_h, pad_w)
# Pad with zeros (was nan, which had to be replaced later in the pipeline).
pad_value = 0  # np.nan
if key == "images": if key == "images":
pad_width = (pad_h, pad_w, (0, 0)) pad_width = (pad_h, pad_w, (0, 0))
...@@ -219,6 +219,20 @@ class DatasetTemplate(torch_data.Dataset): ...@@ -219,6 +219,20 @@ class DatasetTemplate(torch_data.Dataset):
images.append(image_pad) images.append(image_pad)
ret[key] = np.stack(images, axis=0) ret[key] = np.stack(images, axis=0)
elif key in ['calib']:
ret[key] = val
elif key in ["points_2d"]:
max_len = max([len(_val) for _val in val])
pad_value = 0
points = []
for _points in val:
pad_width = ((0, max_len-len(_points)), (0,0))
points_pad = np.pad(_points,
pad_width=pad_width,
mode='constant',
constant_values=pad_value)
points.append(points_pad)
ret[key] = np.stack(points, axis=0)
else:
ret[key] = np.stack(val, axis=0)
except:
...
@@ -421,6 +421,7 @@ class KittiDataset(DatasetTemplate):
if "calib_matricies" in get_item_list:
input_dict["trans_lidar_to_cam"], input_dict["trans_cam_to_img"] = kitti_utils.calib_to_matricies(calib)
input_dict['calib'] = calib
data_dict = self.prepare_data(data_dict=input_dict)
data_dict['image_shape'] = img_shape
...
from .pointnet2_backbone import PointNet2Backbone, PointNet2MSG
from .spconv_backbone import VoxelBackBone8x, VoxelResBackBone8x
from .spconv_backbone_focal import VoxelBackBone8xFocal
from .spconv_unet import UNetV2
__all__ = {
@@ -8,4 +9,5 @@ __all__ = {
'PointNet2Backbone': PointNet2Backbone,
'PointNet2MSG': PointNet2MSG,
'VoxelResBackBone8x': VoxelResBackBone8x,
'VoxelBackBone8xFocal': VoxelBackBone8xFocal,
}
import torch.nn as nn
class BasicBlock1D(nn.Module):
def __init__(self, in_channels, out_channels, **kwargs):
"""
Initializes convolutional block
Args:
in_channels: int, Number of input channels
out_channels: int, Number of output channels
**kwargs: Dict, Extra arguments for nn.Conv1d
"""
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.conv = nn.Conv1d(in_channels=in_channels,
out_channels=out_channels,
**kwargs)
self.bn = nn.BatchNorm1d(out_channels)
self.relu = nn.ReLU(inplace=True)
def forward(self, features):
"""
Applies convolutional block
Args:
features: (B, C_in, L), Input features
Returns:
x: (B, C_out, L), Output features
"""
x = self.conv(features)
x = self.bn(x)
x = self.relu(x)
return x
class BasicBlock2D(nn.Module):
def __init__(self, in_channels, out_channels, **kwargs):
"""
Initializes convolutional block
Args:
in_channels: int, Number of input channels
out_channels: int, Number of output channels
**kwargs: Dict, Extra arguments for nn.Conv2d
"""
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.conv = nn.Conv2d(in_channels=in_channels,
out_channels=out_channels,
**kwargs)
self.bn = nn.BatchNorm2d(out_channels)
self.relu = nn.ReLU(inplace=True)
def forward(self, features):
"""
Applies convolutional block
Args:
features: (B, C_in, H, W), Input features
Returns:
x: (B, C_out, H, W), Output features
"""
x = self.conv(features)
x = self.bn(x)
x = self.relu(x)
return x
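A quick usage sketch of these blocks (illustrative shapes): the extra conv arguments are forwarded through `**kwargs`, e.g. the 1x1 channel-reduction configuration that `PyramidFeat2D` builds below.

```python
import torch

# Reduce 256-channel ResNet 'layer1' features to 16 channels with a 1x1 conv.
block = BasicBlock2D(in_channels=256, out_channels=16, kernel_size=1, stride=1, bias=False)
x = torch.randn(2, 256, 94, 311)  # (B, C_in, H, W)
y = block(x)                      # conv -> BN -> ReLU
print(y.shape)                    # torch.Size([2, 16, 94, 311])
```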
import torch
import torch.nn as nn
from .basic_blocks import BasicBlock2D
from .sem_deeplabv3 import SemDeepLabV3
class PyramidFeat2D(nn.Module):
def __init__(self, optimize, model_cfg):
"""
Initialize 2D feature network via pretrained model
Args:
model_cfg: EasyDict, Dense classification network config
"""
super().__init__()
self.model_cfg = model_cfg
self.is_optimize = optimize
# Create modules
self.ifn = SemDeepLabV3(
num_classes=model_cfg.num_class,
backbone_name=model_cfg.backbone,
**model_cfg.args
)
self.reduce_blocks = torch.nn.ModuleList()
self.out_channels = {}
for _idx, _channel in enumerate(model_cfg.channel_reduce["in_channels"]):
_channel_out = model_cfg.channel_reduce["out_channels"][_idx]
self.out_channels[model_cfg.args['feat_extract_layer'][_idx]] = _channel_out
block_cfg = {"in_channels": _channel,
"out_channels": _channel_out,
"kernel_size": model_cfg.channel_reduce["kernel_size"][_idx],
"stride": model_cfg.channel_reduce["stride"][_idx],
"bias": model_cfg.channel_reduce["bias"][_idx]}
self.reduce_blocks.append(BasicBlock2D(**block_cfg))
def get_output_feature_dim(self):
return self.out_channels
def forward(self, images):
"""
Predicts depths and creates image depth feature volume using depth distributions
Args:
images: (N, 3, H_in, W_in), Input images
Returns:
batch_dict:
frustum_features: (N, C, D, H_out, W_out), Image depth features
"""
# Pixel-wise depth classification
batch_dict = {}
ifn_result = self.ifn(images)
for _idx, _layer in enumerate(self.model_cfg.args['feat_extract_layer']):
image_features = ifn_result[_layer]
# Channel reduce
if self.reduce_blocks[_idx] is not None:
image_features = self.reduce_blocks[_idx](image_features)
batch_dict[_layer+"_feat2d"] = image_features
if self.training:
# detach feature from graph if not optimize
if "logits" in ifn_result:
ifn_result["logits"].detach_()
if not self.is_optimize:
image_features.detach_()
return batch_dict
def get_loss(self):
"""
Gets loss
Args:
Returns:
loss: (1), Network loss
tb_dict: dict[float], All losses to log in tensorboard
"""
return None, None
from collections import OrderedDict
from pathlib import Path
from torch import hub
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from kornia.enhance.normalize import normalize
class SegTemplate(nn.Module):
def __init__(self, constructor, feat_extract_layer, num_classes, pretrained_path=None, aux_loss=None):
"""
Initializes depth distribution network.
Args:
constructor: function, Model constructor
feat_extract_layer: string, Layer to extract features from
num_classes: int, Number of classes
pretrained_path: string, (Optional) Path of the model to load weights from
aux_loss: bool, Flag to include auxillary loss
"""
super().__init__()
self.num_classes = num_classes
self.pretrained_path = pretrained_path
self.pretrained = pretrained_path is not None
self.aux_loss = aux_loss
if self.pretrained:
# Preprocess Module
self.norm_mean = torch.Tensor([0.485, 0.456, 0.406])
self.norm_std = torch.Tensor([0.229, 0.224, 0.225])
# Model
self.model = self.get_model(constructor=constructor)
self.feat_extract_layer = feat_extract_layer
return_layers = {_layer:_layer for _layer in feat_extract_layer}
self.model.backbone.return_layers.update(return_layers)
def get_model(self, constructor):
"""
Get model
Args:
constructor: function, Model constructor
Returns:
model: nn.Module, Model
"""
# Get model
model = constructor(pretrained=False,
pretrained_backbone=False,
num_classes=self.num_classes,
aux_loss=self.aux_loss)
# Update weights
if self.pretrained_path is not None:
model_dict = model.state_dict()
# Download pretrained model if not available yet
checkpoint_path = Path(self.pretrained_path)
if not checkpoint_path.exists():
checkpoint = checkpoint_path.name
save_dir = checkpoint_path.parent
save_dir.mkdir(parents=True, exist_ok=True)
url = f'https://download.pytorch.org/models/{checkpoint}'
hub.load_state_dict_from_url(url, save_dir)
# Get pretrained state dict
pretrained_dict = torch.load(self.pretrained_path)
#pretrained_dict = self.filter_pretrained_dict(model_dict=model_dict, pretrained_dict=pretrained_dict)
# Update current model state dict
model_dict.update(pretrained_dict)
model.load_state_dict(model_dict, strict=False)
return model.cuda()
def filter_pretrained_dict(self, model_dict, pretrained_dict):
"""
Removes layers from pretrained state dict that are not used or changed in model
Args:
model_dict: dict, Default model state dictionary
pretrained_dict: dict, Pretrained model state dictionary
Returns:
pretrained_dict: dict, Pretrained model state dictionary with removed weights
"""
# Removes aux classifier weights if not used
if "aux_classifier.0.weight" in pretrained_dict and "aux_classifier.0.weight" not in model_dict:
pretrained_dict = {key: value for key, value in pretrained_dict.items()
if "aux_classifier" not in key}
# Removes final conv layer from weights if number of classes are different
model_num_classes = model_dict["classifier.4.weight"].shape[0]
pretrained_num_classes = pretrained_dict["classifier.4.weight"].shape[0]
if model_num_classes != pretrained_num_classes:
pretrained_dict.pop("classifier.4.weight")
pretrained_dict.pop("classifier.4.bias")
return pretrained_dict
def forward(self, images):
"""
Forward pass
Args:
images: (N, 3, H_in, W_in), Input images
Returns
result: dict[torch.Tensor], Depth distribution result
features: (N, C, H_out, W_out), Image features
logits: (N, num_classes, H_out, W_out), Classification logits
aux: (N, num_classes, H_out, W_out), Auxillary classification logits
"""
# Preprocess images
x = self.preprocess(images)
# Extract features
result = OrderedDict()
features = self.model.backbone(x)
for _layer in self.feat_extract_layer:
result[_layer] = features[_layer]
return result  # early return: the classifier branches below are intentionally skipped to save memory
if 'features' in features.keys():
feat_shape = features['features'].shape[-2:]
else:
feat_shape = features['layer1'].shape[-2:]
# Prediction classification logits
x = features["out"] # comment the classifier to reduce memory
# x = self.model.classifier(x)
# x = F.interpolate(x, size=feat_shape, mode='bilinear', align_corners=False)
result["logits"] = x
# Prediction auxiliary classification logits
if self.model.aux_classifier is not None:
x = features["aux"]
x = self.model.aux_classifier(x)
x = F.interpolate(x, size=feat_shape, mode='bilinear', align_corners=False)
result["aux"] = x
return result
def preprocess(self, images):
"""
Preprocess images
Args:
images: (N, 3, H, W), Input images
Returns:
x: (N, 3, H, W), Preprocessed images
"""
x = images
if self.pretrained:
# Match ResNet pretrained preprocessing
x = normalize(x, mean=self.norm_mean, std=self.norm_std)
return x.cuda()
class SemDeepLabV3(SegTemplate):
def __init__(self, backbone_name, **kwargs):
"""
Initializes SemDeepLabV3 model
Args:
backbone_name: string, ResNet Backbone Name [ResNet50/ResNet101]
"""
if backbone_name == "ResNet50":
constructor = torchvision.models.segmentation.deeplabv3_resnet50
elif backbone_name == "ResNet101":
constructor = torchvision.models.segmentation.deeplabv3_resnet101
else:
raise NotImplementedError
super().__init__(constructor=constructor, **kwargs)
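A minimal instantiation sketch (requires a GPU and the checkpoint on disk; shapes are for a KITTI-sized input and approximate):

```python
import torch

seg = SemDeepLabV3(
    backbone_name="ResNet50",
    feat_extract_layer=["layer1"],
    num_classes=21,  # COCO-pretrained head
    pretrained_path="../checkpoints/deeplabv3_resnet50_coco-cd0a2569.pth",
)
result = seg(torch.randn(1, 3, 375, 1242))
print(result["layer1"].shape)  # roughly (1, 256, H/4, W/4), e.g. torch.Size([1, 256, 94, 311])
```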
import torch
import torch.nn as nn
import spconv.pytorch as spconv
from pcdet.ops.roiaware_pool3d.roiaware_pool3d_utils import points_in_boxes_gpu
from pcdet.models.backbones_3d.focal_sparse_conv.utils import split_voxels, check_repeat, FocalLoss
from pcdet.utils import common_utils
class FocalSparseConv(spconv.SparseModule):
expansion = 1
def __init__(self, inplanes, planes, voxel_stride, norm_fn=None, indice_key=None,
image_channel=3, kernel_size=3, padding=1, mask_multi=False, use_img=False,
topk=False, threshold=0.5, skip_mask_kernel=False, enlarge_voxel_channels=-1,
point_cloud_range=[-3, -40, 0, 1, 40, 70.4],
voxel_size = [0.1, 0.05, 0.05]):
super(FocalSparseConv, self).__init__()
self.conv = spconv.SubMConv3d(inplanes, planes, kernel_size=kernel_size, stride=1, bias=False, indice_key=indice_key)
self.bn1 = norm_fn(planes)
self.relu = nn.ReLU(True)
offset_channels = kernel_size**3
self.topk = topk
self.threshold = threshold
self.voxel_stride = voxel_stride
self.focal_loss = FocalLoss()
self.mask_multi = mask_multi
self.skip_mask_kernel = skip_mask_kernel
self.use_img = use_img
voxel_channel = enlarge_voxel_channels if enlarge_voxel_channels>0 else inplanes
in_channels = image_channel + voxel_channel if use_img else voxel_channel
self.conv_enlarge = spconv.SparseSequential(spconv.SubMConv3d(inplanes, enlarge_voxel_channels,
kernel_size=3, stride=1, padding=1, bias=False, indice_key=indice_key+'_enlarge'),
norm_fn(enlarge_voxel_channels),
nn.ReLU(True)) if enlarge_voxel_channels>0 else None
self.conv_imp = spconv.SubMConv3d(in_channels, offset_channels, kernel_size=3, stride=1, padding=1, bias=False, indice_key=indice_key+'_imp')
_step = int(kernel_size//2)
kernel_offsets = [[i, j, k] for i in range(-_step, _step+1) for j in range(-_step, _step+1) for k in range(-_step, _step+1)]
kernel_offsets.remove([0, 0, 0])
self.kernel_offsets = torch.Tensor(kernel_offsets).cuda()
self.inv_idx = torch.Tensor([2, 1, 0]).long().cuda()
self.point_cloud_range = torch.Tensor(point_cloud_range).cuda()
self.voxel_size = torch.Tensor(voxel_size).cuda()
def construct_multimodal_features(self, x, x_rgb, batch_dict, fuse_sum=False):
"""
Construct the multimodal features with both lidar sparse features and image features.
Args:
x: [N, C] lidar sparse features
x_rgb: [b, c, h, w] image features
batch_dict: input and output information during forward
fuse_sum: bool, manner for fusion, True - sum, False - concat
Return:
image_with_voxelfeatures: [N, C] fused multimodal features
"""
batch_index = x.indices[:, 0]
spatial_indices = x.indices[:, 1:] * self.voxel_stride
voxels_3d = spatial_indices * self.voxel_size + self.point_cloud_range[:3]
calibs = batch_dict['calib']
batch_size = batch_dict['batch_size']
h, w = batch_dict['images'].shape[2:]
if not x_rgb.shape == batch_dict['images'].shape:
x_rgb = nn.functional.interpolate(x_rgb, (h, w), mode='bilinear')
image_with_voxelfeatures = []
voxels_2d_int_list = []
filter_idx_list = []
for b in range(batch_size):
x_rgb_batch = x_rgb[b]
calib = calibs[b]
voxels_3d_batch = voxels_3d[batch_index==b]
voxel_features_sparse = x.features[batch_index==b]
# Reverse the point cloud transformations to the original coords.
if 'noise_scale' in batch_dict:
voxels_3d_batch[:, :3] /= batch_dict['noise_scale'][b]
if 'noise_rot' in batch_dict:
voxels_3d_batch = common_utils.rotate_points_along_z(voxels_3d_batch[:, self.inv_idx].unsqueeze(0), -batch_dict['noise_rot'][b].unsqueeze(0))[0, :, self.inv_idx]
if 'flip_x' in batch_dict:
voxels_3d_batch[:, 1] *= -1 if batch_dict['flip_x'][b] else 1
if 'flip_y' in batch_dict:
voxels_3d_batch[:, 2] *= -1 if batch_dict['flip_y'][b] else 1
voxels_2d, _ = calib.lidar_to_img(voxels_3d_batch[:, self.inv_idx].cpu().numpy())
voxels_2d_int = torch.Tensor(voxels_2d).to(x_rgb_batch.device).long()
filter_idx = (0<=voxels_2d_int[:, 1]) * (voxels_2d_int[:, 1] < h) * (0<=voxels_2d_int[:, 0]) * (voxels_2d_int[:, 0] < w)
filter_idx_list.append(filter_idx)
voxels_2d_int = voxels_2d_int[filter_idx]
voxels_2d_int_list.append(voxels_2d_int)
image_features_batch = torch.zeros((voxel_features_sparse.shape[0], x_rgb_batch.shape[0]), device=x_rgb_batch.device)
image_features_batch[filter_idx] = x_rgb_batch[:, voxels_2d_int[:, 1], voxels_2d_int[:, 0]].permute(1, 0)
if fuse_sum:
image_with_voxelfeature = image_features_batch + voxel_features_sparse
else:
image_with_voxelfeature = torch.cat([image_features_batch, voxel_features_sparse], dim=1)
image_with_voxelfeatures.append(image_with_voxelfeature)
image_with_voxelfeatures = torch.cat(image_with_voxelfeatures)
return image_with_voxelfeatures
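Stripped to its essentials, the fusion projects each active voxel center into the image and gathers the image feature at that pixel, exactly the indexing pattern in the loop above. A standalone sketch:

```python
import torch

feats = torch.randn(16, 188, 621)                        # (C, H, W) image features for one sample
uv = torch.tensor([[100, 50], [300, 120]])               # (N, 2) integer pixel coords, (u, v) = (col, row)
per_voxel = feats[:, uv[:, 1], uv[:, 0]].permute(1, 0)   # (N, C), one feature vector per voxel
print(per_voxel.shape)                                   # torch.Size([2, 16])
```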
def _gen_sparse_features(self, x, imps_3d, batch_dict, voxels_3d):
"""
Generate the output sparse features from the focal sparse conv.
Args:
x: [N, C], lidar sparse features
imps_3d: [N, kernelsize**3], the predicted importance values
batch_dict: input and output information during forward
voxels_3d: [N, 3], the 3d positions of voxel centers
"""
batch_size = x.batch_size
voxel_features_fore = []
voxel_indices_fore = []
voxel_features_back = []
voxel_indices_back = []
box_of_pts_cls_targets = []
mask_voxels = []
mask_kernel_list = []
for b in range(batch_size):
if self.training:
index = x.indices[:, 0]
batch_index = index==b
mask_voxel = imps_3d[batch_index, -1].sigmoid()
voxels_3d_batch = voxels_3d[batch_index].unsqueeze(0)
mask_voxels.append(mask_voxel)
gt_boxes = batch_dict['gt_boxes'][b, :, :-1].unsqueeze(0)
box_of_pts_batch = points_in_boxes_gpu(voxels_3d_batch[:, :, self.inv_idx], gt_boxes).squeeze(0)
box_of_pts_cls_targets.append(box_of_pts_batch>=0)
features_fore, indices_fore, features_back, indices_back, mask_kernel = split_voxels(x, b, imps_3d, voxels_3d, self.kernel_offsets, mask_multi=self.mask_multi, topk=self.topk, threshold=self.threshold)
mask_kernel_list.append(mask_kernel)
voxel_features_fore.append(features_fore)
voxel_indices_fore.append(indices_fore)
voxel_features_back.append(features_back)
voxel_indices_back.append(indices_back)
voxel_features_fore = torch.cat(voxel_features_fore, dim=0)
voxel_indices_fore = torch.cat(voxel_indices_fore, dim=0)
voxel_features_back = torch.cat(voxel_features_back, dim=0)
voxel_indices_back = torch.cat(voxel_indices_back, dim=0)
mask_kernel = torch.cat(mask_kernel_list, dim=0)
x_fore = spconv.SparseConvTensor(voxel_features_fore, voxel_indices_fore, x.spatial_shape, x.batch_size)
x_back = spconv.SparseConvTensor(voxel_features_back, voxel_indices_back, x.spatial_shape, x.batch_size)
loss_box_of_pts = 0
if self.training:
mask_voxels = torch.cat(mask_voxels)
box_of_pts_cls_targets = torch.cat(box_of_pts_cls_targets)
mask_voxels_two_classes = torch.cat([1-mask_voxels.unsqueeze(-1), mask_voxels.unsqueeze(-1)], dim=1)
loss_box_of_pts = self.focal_loss(mask_voxels_two_classes, box_of_pts_cls_targets.long())
return x_fore, x_back, loss_box_of_pts, mask_kernel
def combine_out(self, x_fore, x_back, remove_repeat=False):
"""
Combine the foreground and background sparse features together.
Args:
x_fore: [N1, C], foreground sparse features
x_back: [N2, C], background sparse features
remove_repeat: bool, whether to remove the spatial replicate features.
"""
x_fore_features = torch.cat([x_fore.features, x_back.features], dim=0)
x_fore_indices = torch.cat([x_fore.indices, x_back.indices], dim=0)
if remove_repeat:
index = x_fore_indices[:, 0]
features_out_list = []
indices_coords_out_list = []
for b in range(x_fore.batch_size):
batch_index = index==b
features_out, indices_coords_out, _ = check_repeat(x_fore_features[batch_index], x_fore_indices[batch_index], flip_first=False)
features_out_list.append(features_out)
indices_coords_out_list.append(indices_coords_out)
x_fore_features = torch.cat(features_out_list, dim=0)
x_fore_indices = torch.cat(indices_coords_out_list, dim=0)
x_fore = x_fore.replace_feature(x_fore_features)
x_fore.indices = x_fore_indices
return x_fore
def forward(self, x, batch_dict, x_rgb=None):
spatial_indices = x.indices[:, 1:] * self.voxel_stride
voxels_3d = spatial_indices * self.voxel_size + self.point_cloud_range[:3]
if self.use_img:
features_multimodal = self.construct_multimodal_features(x, x_rgb, batch_dict)
x_predict = spconv.SparseConvTensor(features_multimodal, x.indices, x.spatial_shape, x.batch_size)
else:
x_predict = self.conv_enlarge(x) if self.conv_enlarge else x
imps_3d = self.conv_imp(x_predict).features
x_fore, x_back, loss_box_of_pts, mask_kernel = self._gen_sparse_features(x, imps_3d, batch_dict, voxels_3d)
if not self.skip_mask_kernel:
x_fore = x_fore.replace_feature(x_fore.features * mask_kernel.unsqueeze(-1))
out = self.combine_out(x_fore, x_back, remove_repeat=True)
out = self.conv(out)
if self.use_img:
out = out.replace_feature(self.construct_multimodal_features(out, x_rgb, batch_dict, True))
out = out.replace_feature(self.bn1(out.features))
out = out.replace_feature(self.relu(out.features))
batch_dict['loss_box_of_pts'] += loss_box_of_pts
return out, batch_dict
import torch
import torch.nn as nn
import torch.nn.functional as F
class FocalLoss(nn.Module):
def __init__(self, gamma=2.0, eps=1e-7):
super(FocalLoss, self).__init__()
self.gamma = gamma
self.eps = eps
def one_hot(self, index, classes):
# Build a one-hot encoding of `index` over `classes` classes
# (rewritten without the deprecated torch.autograd.Variable / volatile API).
size = index.size() + (classes,)
view = index.size() + (1,)
mask = torch.zeros(size, device=index.device)
index = index.view(*view)
return mask.scatter_(1, index, 1.)
def forward(self, input, target):
y = self.one_hot(target, input.size(-1))
logit = F.softmax(input, dim=-1)
logit = logit.clamp(self.eps, 1. - self.eps)
loss = -1 * y * torch.log(logit) # cross entropy
loss = loss * (1 - logit) ** self.gamma # focal loss
return loss.mean()
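A tiny sanity check of the focal term: a confidently correct prediction is down-weighted by (1 - p)^gamma, so it contributes far less than an uncertain one. Sketch with dummy logits:

```python
import torch

criterion = FocalLoss(gamma=2.0)
logits = torch.tensor([[4.0, -4.0], [0.1, -0.1]])  # confident vs. uncertain sample
targets = torch.tensor([0, 0])                     # both labeled class 0
print(criterion(logits, targets))                  # the confident sample contributes almost nothing
```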
def sort_by_indices(features, indices, features_add=None):
"""
To sort the sparse features with its indices in a convenient manner.
Args:
features: [N, C], sparse features
indices: [N, 4], indices of sparse features
features_add: [N, C], additional features to sort
"""
idx = indices[:, 1:]
idx_sum = idx.select(1, 0) * idx[:, 1].max() * idx[:, 2].max() + idx.select(1, 1) * idx[:, 2].max() + idx.select(1, 2)
_, ind = idx_sum.sort()
features = features[ind]
indices = indices[ind]
if not features_add is None:
features_add = features_add[ind]
return features, indices, features_add
def check_repeat(features, indices, features_add=None, sort_first=True, flip_first=True):
"""
Check whether there are duplicated indices in the sparse features,
and merge the duplicated features if any (features are summed, features_add is averaged).
"""
if sort_first:
features, indices, features_add = sort_by_indices(features, indices, features_add)
if flip_first:
features, indices = features.flip([0]), indices.flip([0])
if not features_add is None:
features_add=features_add.flip([0])
idx = indices[:, 1:].int()
idx_sum = torch.add(torch.add(idx.select(1, 0) * idx[:, 1].max() * idx[:, 2].max(), idx.select(1, 1) * idx[:, 2].max()), idx.select(1, 2))
_unique, inverse, counts = torch.unique_consecutive(idx_sum, return_inverse=True, return_counts=True, dim=0)
if _unique.shape[0] < indices.shape[0]:
perm = torch.arange(inverse.size(0), dtype=inverse.dtype, device=inverse.device)
features_new = torch.zeros((_unique.shape[0], features.shape[-1]), device=features.device)
features_new.index_add_(0, inverse.long(), features)
features = features_new
perm_ = inverse.new_empty(_unique.size(0)).scatter_(0, inverse, perm)
indices = indices[perm_].int()
if not features_add is None:
features_add_new = torch.zeros((_unique.shape[0],), device=features_add.device)
features_add_new.index_add_(0, inverse.long(), features_add)
features_add = features_add_new / counts
return features, indices, features_add
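A small demonstration of the merge that check_repeat performs: rows sharing the same (batch, z, y, x) index are summed via index_add_ (hypothetical toy inputs):

```python
import torch

features = torch.tensor([[1.], [2.], [4.]])
indices = torch.tensor([[0, 1, 1, 1],
                        [0, 1, 1, 1],   # duplicate of the first row's voxel
                        [0, 2, 2, 2]]).int()
f, idx, _ = check_repeat(features, indices)
print(f.squeeze(-1))  # tensor([4., 3.]): the two (0,1,1,1) rows merged as 1 + 2 = 3
print(idx.shape)      # torch.Size([2, 4])
```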
def split_voxels(x, b, imps_3d, voxels_3d, kernel_offsets, mask_multi=True, topk=True, threshold=0.5):
"""
Generate and split the voxels into foreground and background sparse features, based on the predicted importance values.
Args:
x: [N, C], input sparse features
b: int, batch index
imps_3d: [N, kernelsize**3], the predicted importance values
voxels_3d: [N, 3], the 3d positions of voxel centers
kernel_offsets: [kernelsize**3, 3], the offset coordinates within a kernel
mask_multi: bool, whether to multiply the predicted mask to features
topk: bool, whether to use topk or threshold for selection
threshold: float, threshold value
"""
index = x.indices[:, 0]
batch_index = index==b
indices_ori = x.indices[batch_index]
features_ori = x.features[batch_index]
mask_voxel = imps_3d[batch_index, -1].sigmoid()
mask_kernel = imps_3d[batch_index, :-1].sigmoid()
if mask_multi:
features_ori *= mask_voxel.unsqueeze(-1)
if topk:
_, indices = mask_voxel.sort(descending=True)
indices_fore = indices[:int(mask_voxel.shape[0]*threshold)]
indices_back = indices[int(mask_voxel.shape[0]*threshold):]
else:
indices_fore = mask_voxel > threshold
indices_back = mask_voxel <= threshold
features_fore = features_ori[indices_fore]
coords_fore = indices_ori[indices_fore]
mask_kernel_fore = mask_kernel[indices_fore]
mask_kernel_bool = mask_kernel_fore>=threshold
voxel_kernels_imp = kernel_offsets.unsqueeze(0).repeat(mask_kernel_bool.shape[0], 1, 1)
mask_kernel_fore = mask_kernel[indices_fore][mask_kernel_bool]
indices_fore_kernels = coords_fore[:, 1:].unsqueeze(1).repeat(1, kernel_offsets.shape[0], 1)
indices_with_imp = indices_fore_kernels + voxel_kernels_imp
selected_indices = indices_with_imp[mask_kernel_bool]
spatial_indices = (selected_indices[:, 0] >0) * (selected_indices[:, 1] >0) * (selected_indices[:, 2] >0) * \
(selected_indices[:, 0] < x.spatial_shape[0]) * (selected_indices[:, 1] < x.spatial_shape[1]) * (selected_indices[:, 2] < x.spatial_shape[2])
selected_indices = selected_indices[spatial_indices]
mask_kernel_fore = mask_kernel_fore[spatial_indices]
selected_indices = torch.cat([torch.ones((selected_indices.shape[0], 1), device=features_fore.device)*b, selected_indices], dim=1)
selected_features = torch.zeros((selected_indices.shape[0], features_ori.shape[1]), device=features_fore.device)
features_fore_cat = torch.cat([features_fore, selected_features], dim=0)
coords_fore = torch.cat([coords_fore, selected_indices], dim=0)
mask_kernel_fore = torch.cat([torch.ones(features_fore.shape[0], device=features_fore.device), mask_kernel_fore], dim=0)
features_fore, coords_fore, mask_kernel_fore = check_repeat(features_fore_cat, coords_fore, features_add=mask_kernel_fore)
features_back = features_ori[indices_back]
coords_back = indices_ori[indices_back]
return features_fore, coords_fore, features_back, coords_back, mask_kernel_fore
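The foreground/background split ultimately just ranks the per-voxel importance mask. A sketch of the two selection modes, assuming threshold=0.5:

```python
import torch

mask_voxel = torch.tensor([0.9, 0.2, 0.7, 0.4])
# topk mode: keep the top 50% most important voxels as foreground
_, order = mask_voxel.sort(descending=True)
fore_topk = order[:int(mask_voxel.shape[0] * 0.5)]  # tensor([0, 2])
# threshold mode: keep voxels whose importance exceeds 0.5
fore_thres = mask_voxel > 0.5                       # tensor([True, False, True, False])
print(fore_topk, fore_thres)
```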
from functools import partial
import torch
import spconv.pytorch as spconv
import torch.nn as nn
from .focal_sparse_conv.focal_sparse_conv import FocalSparseConv
from .focal_sparse_conv.SemanticSeg.pyramid_ffn import PyramidFeat2D
class objDict:
@staticmethod
def to_object(obj: object, **data):
obj.__dict__.update(data)
class ConfigDict:
def __init__(self, name):
self.name = name
def __getitem__(self, item):
return getattr(self, item)
class SparseSequentialBatchdict(spconv.SparseSequential):
def __init__(self, *args, **kwargs):
super(SparseSequentialBatchdict, self).__init__(*args, **kwargs)
def forward(self, input, batch_dict=None):
for k, module in self._modules.items():
if module is None:
continue
if isinstance(module, (FocalSparseConv,)):
input, batch_dict = module(input, batch_dict)
else:
input = module(input)
return input, batch_dict
def post_act_block(in_channels, out_channels, kernel_size, indice_key=None, stride=1, padding=0,
conv_type='subm', norm_fn=None):
if conv_type == 'subm':
conv = spconv.SubMConv3d(in_channels, out_channels, kernel_size, bias=False, indice_key=indice_key)
elif conv_type == 'spconv':
conv = spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=stride, padding=padding,
bias=False, indice_key=indice_key)
elif conv_type == 'inverseconv':
conv = spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size, indice_key=indice_key, bias=False)
else:
raise NotImplementedError
m = spconv.SparseSequential(
conv,
norm_fn(out_channels),
nn.ReLU(True),
)
return m
class SparseBasicBlock(spconv.SparseModule):
expansion = 1
def __init__(self, inplanes, planes, stride=1, norm_fn=None, downsample=None, indice_key=None):
super(SparseBasicBlock, self).__init__()
assert norm_fn is not None
bias = norm_fn is not None
self.conv1 = spconv.SubMConv3d(
inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=bias, indice_key=indice_key
)
self.bn1 = norm_fn(planes)
self.relu = nn.ReLU(True)
self.conv2 = spconv.SubMConv3d(
planes, planes, kernel_size=3, stride=stride, padding=1, bias=bias, indice_key=indice_key
)
self.bn2 = norm_fn(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
identity = x
out = self.conv1(x)
out = out.replace_feature(self.bn1(out.features))
out = out.replace_feature(self.relu(out.features))
out = self.conv2(out)
out = out.replace_feature(self.bn2(out.features))
if self.downsample is not None:
identity = self.downsample(x)
out = out.replace_feature(out.features + identity.features)
out = out.replace_feature(self.relu(out.features))
return out
class VoxelBackBone8xFocal(nn.Module):
def __init__(self, model_cfg, input_channels, grid_size, **kwargs):
super().__init__()
self.model_cfg = model_cfg
norm_fn = partial(nn.BatchNorm1d, eps=1e-3, momentum=0.01)
self.sparse_shape = grid_size[::-1] + [1, 0, 0]
self.conv_input = spconv.SparseSequential(
spconv.SubMConv3d(input_channels, 16, 3, padding=1, bias=False, indice_key='subm1'),
norm_fn(16),
nn.ReLU(True),
)
block = post_act_block
use_img = model_cfg.get('USE_IMG', False)
topk = model_cfg.get('TOPK', True)
threshold = model_cfg.get('THRESHOLD', 0.5)
kernel_size = model_cfg.get('KERNEL_SIZE', 3)
mask_multi = model_cfg.get('MASK_MULTI', False)
skip_mask_kernel = model_cfg.get('SKIP_MASK_KERNEL', False)
skip_mask_kernel_image = model_cfg.get('SKIP_MASK_KERNEL_IMG', False)
enlarge_voxel_channels = model_cfg.get('ENLARGE_VOXEL_CHANNELS', -1)
img_pretrain = model_cfg.get('IMG_PRETRAIN', "../checkpoints/deeplabv3_resnet50_coco-cd0a2569.pth")
if use_img:
model_cfg_seg=dict(
name='SemDeepLabV3',
backbone='ResNet50',
num_class=21, # pretrained on COCO
args={"feat_extract_layer": ["layer1"],
"pretrained_path": img_pretrain},
channel_reduce={
"in_channels": [256],
"out_channels": [16],
"kernel_size": [1],
"stride": [1],
"bias": [False]
}
)
cfg_dict = ConfigDict('SemDeepLabV3')
objDict.to_object(cfg_dict, **model_cfg_seg)
self.semseg = PyramidFeat2D(optimize=True, model_cfg=cfg_dict)
self.conv_focal_multimodal = FocalSparseConv(16, 16, image_channel=model_cfg_seg['channel_reduce']['out_channels'][0],
topk=topk, threshold=threshold, use_img=True, skip_mask_kernel=skip_mask_kernel_image,
voxel_stride=1, norm_fn=norm_fn, indice_key='spconv_focal_multimodal')
special_spconv_fn = partial(FocalSparseConv, mask_multi=mask_multi, enlarge_voxel_channels=enlarge_voxel_channels,
topk=topk, threshold=threshold, kernel_size=kernel_size, padding=kernel_size//2,
skip_mask_kernel=skip_mask_kernel)
self.use_img = use_img
self.conv1 = SparseSequentialBatchdict(
block(16, 16, 3, norm_fn=norm_fn, padding=1, indice_key='subm1'),
special_spconv_fn(16, 16, voxel_stride=1, norm_fn=norm_fn, indice_key='focal1'),
)
self.conv2 = SparseSequentialBatchdict(
# [1600, 1408, 41] <- [800, 704, 21]
block(16, 32, 3, norm_fn=norm_fn, stride=2, padding=1, indice_key='spconv2', conv_type='spconv'),
block(32, 32, 3, norm_fn=norm_fn, padding=1, indice_key='subm2'),
block(32, 32, 3, norm_fn=norm_fn, padding=1, indice_key='subm2'),
special_spconv_fn(32, 32, voxel_stride=2, norm_fn=norm_fn, indice_key='focal2'),
)
self.conv3 = SparseSequentialBatchdict(
# [800, 704, 21] <- [400, 352, 11]
block(32, 64, 3, norm_fn=norm_fn, stride=2, padding=1, indice_key='spconv3', conv_type='spconv'),
block(64, 64, 3, norm_fn=norm_fn, padding=1, indice_key='subm3'),
block(64, 64, 3, norm_fn=norm_fn, padding=1, indice_key='subm3'),
special_spconv_fn(64, 64, voxel_stride=4, norm_fn=norm_fn, indice_key='focal3'),
)
self.conv4 = SparseSequentialBatchdict(
# [400, 352, 11] <- [200, 176, 5]
block(64, 64, 3, norm_fn=norm_fn, stride=2, padding=(0, 1, 1), indice_key='spconv4', conv_type='spconv'),
block(64, 64, 3, norm_fn=norm_fn, padding=1, indice_key='subm4'),
block(64, 64, 3, norm_fn=norm_fn, padding=1, indice_key='subm4'),
)
last_pad = 0
last_pad = self.model_cfg.get('last_pad', last_pad)
self.conv_out = spconv.SparseSequential(
# [200, 176, 5] -> [200, 176, 2]
spconv.SparseConv3d(64, 128, (3, 1, 1), stride=(2, 1, 1), padding=last_pad,
bias=False, indice_key='spconv_down2'),
norm_fn(128),
nn.ReLU(True),
)
self.num_point_features = 128
self.backbone_channels = {
'x_conv1': 16,
'x_conv2': 32,
'x_conv3': 64,
'x_conv4': 64
}
def forward(self, batch_dict):
"""
Args:
batch_dict:
batch_size: int
vfe_features: (num_voxels, C)
voxel_coords: (num_voxels, 4), [batch_idx, z_idx, y_idx, x_idx]
Returns:
batch_dict:
encoded_spconv_tensor: sparse tensor
"""
voxel_features, voxel_coords = batch_dict['voxel_features'], batch_dict['voxel_coords']
batch_size = batch_dict['batch_size']
input_sp_tensor = spconv.SparseConvTensor(
features=voxel_features,
indices=voxel_coords.int(),
spatial_shape=self.sparse_shape,
batch_size=batch_size
)
batch_dict['loss_box_of_pts'] = 0
x = self.conv_input(input_sp_tensor)
x_conv1, batch_dict = self.conv1(x, batch_dict)
if self.use_img:
x_image = self.semseg(batch_dict['images'])['layer1_feat2d']
x_conv1, batch_dict = self.conv_focal_multimodal(x_conv1, batch_dict, x_image)
x_conv2, batch_dict = self.conv2(x_conv1, batch_dict)
x_conv3, batch_dict = self.conv3(x_conv2, batch_dict)
x_conv4, batch_dict = self.conv4(x_conv3, batch_dict)
# for detection head
# [200, 176, 5] -> [200, 176, 2]
out = self.conv_out(x_conv4)
batch_dict.update({
'encoded_spconv_tensor': out,
'encoded_spconv_tensor_stride': 8
})
batch_dict.update({
'multi_scale_3d_features': {
'x_conv1': x_conv1,
'x_conv2': x_conv2,
'x_conv3': x_conv3,
'x_conv4': x_conv4,
}
})
batch_dict.update({
'multi_scale_3d_strides': {
'x_conv1': 1,
'x_conv2': 2,
'x_conv3': 4,
'x_conv4': 8,
}
})
return batch_dict
@@ -12,6 +12,9 @@ class PVRCNN(Detector3DTemplate):
if self.training:
loss, tb_dict, disp_dict = self.get_training_loss()
if 'loss_box_of_pts' in batch_dict:
loss += batch_dict['loss_box_of_pts']
tb_dict['loss_box_of_pts'] = batch_dict['loss_box_of_pts']
ret_dict = {
'loss': loss
...
@@ -13,6 +13,10 @@ class VoxelRCNN(Detector3DTemplate):
if self.training:
loss, tb_dict, disp_dict = self.get_training_loss()
if 'loss_box_of_pts' in batch_dict:
loss += batch_dict['loss_box_of_pts']
tb_dict['loss_box_of_pts'] = batch_dict['loss_box_of_pts']
ret_dict = {
'loss': loss
}
...
import torch
def area(box) -> torch.Tensor:
"""
Computes the area of all the boxes.
Returns:
torch.Tensor: a vector with areas of each box.
"""
area = (box[:, 2] - box[:, 0]) * (box[:, 3] - box[:, 1])
return area
# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
# with slight modifications
def pairwise_iou(boxes1, boxes2) -> torch.Tensor:
"""
Given two lists of boxes of size N and M,
compute the IoU (intersection over union)
between __all__ N x M pairs of boxes.
The box order must be (xmin, ymin, xmax, ymax).
Args:
boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively.
Returns:
Tensor: IoU, sized [N,M].
"""
area1 = area(boxes1)
area2 = area(boxes2)
width_height = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) - torch.max(
boxes1[:, None, :2], boxes2[:, :2]
) # [N,M,2]
width_height.clamp_(min=0) # [N,M,2]
inter = width_height.prod(dim=2) # [N,M]
del width_height
# handle empty boxes
iou = torch.where(
inter > 0,
inter / (area1[:, None] + area2 - inter),
torch.zeros(1, dtype=inter.dtype, device=inter.device),
)
return iou
\ No newline at end of file
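A quick numeric check of pairwise_iou with boxes in (xmin, ymin, xmax, ymax) order:

```python
import torch

boxes1 = torch.tensor([[0., 0., 2., 2.]])
boxes2 = torch.tensor([[1., 1., 3., 3.], [10., 10., 12., 12.]])
print(pairwise_iou(boxes1, boxes2))  # tensor([[0.1429, 0.0000]]): 1 / (4 + 4 - 1), then disjoint -> 0
```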
@@ -52,6 +52,43 @@ def boxes_to_corners_3d(boxes3d):
return corners3d.numpy() if is_numpy else corners3d
def corners_rect_to_camera(corners):
"""
7 -------- 4
/| /|
6 -------- 5 .
| | | |
. 3 -------- 0
|/ |/
2 -------- 1
Args:
corners: (8, 3) [x0, y0, z0, ...], (x, y, z) is the point coordinate in image rect
Returns:
boxes_rect: (7,) [x, y, z, l, h, w, r] in rect camera coords
"""
height_group = [(0, 4), (1, 5), (2, 6), (3, 7)]
width_group = [(0, 1), (2, 3), (4, 5), (6, 7)]
length_group = [(0, 3), (1, 2), (4, 7), (5, 6)]
vector_group = [(0, 3), (1, 2), (4, 7), (5, 6)]
height, width, length = 0., 0., 0.
vector = np.zeros(2, dtype=np.float32)
for index_h, index_w, index_l, index_v in zip(height_group, width_group, length_group, vector_group):
height += np.linalg.norm(corners[index_h[0], :] - corners[index_h[1], :])
width += np.linalg.norm(corners[index_w[0], :] - corners[index_w[1], :])
length += np.linalg.norm(corners[index_l[0], :] - corners[index_l[1], :])
vector[0] += (corners[index_v[0], :] - corners[index_v[1], :])[0]
vector[1] += (corners[index_v[0], :] - corners[index_v[1], :])[2]
height, width, length = height / 4, width / 4, length / 4
rotation_y = -np.arctan2(vector[1], vector[0])
center_point = corners.mean(axis=0)
center_point[1] += height/2
camera_rect = np.concatenate([center_point, np.array([length, height, width, rotation_y])])
return camera_rect
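A worked sanity check under KITTI rect-camera conventions (x right, y down, z forward; the decoded (x, y, z) is the bottom-face center): for an axis-aligned box the recovered heading is zero. Corner ordering follows the diagram above; the example is illustrative only:

```python
import numpy as np

l, h, w = 4.0, 1.5, 2.0
x_c = np.array([ l/2,  l/2, -l/2, -l/2,  l/2,  l/2, -l/2, -l/2])
y_c = np.array([ 0.0,  0.0,  0.0,  0.0, -h,   -h,   -h,   -h  ])  # y points down, top face at -h
z_c = np.array([ w/2, -w/2, -w/2,  w/2,  w/2, -w/2, -w/2,  w/2])
corners = np.stack([x_c, y_c, z_c], axis=1)  # (8, 3) in rect camera coords
print(corners_rect_to_camera(corners))       # -> [0. 0. 0. 4. 1.5 2. 0.]
```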
def mask_boxes_outside_range_numpy(boxes, limit_range, min_num_corners=1):
"""
...