Unverified Commit 6c03a971 authored by Tai-Wang, committed by GitHub

Release v1.1.0rc1

parents 9611c2d0 ca42c312
# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.hooks import Hook
from mmdet3d.registry import HOOKS
@HOOKS.register_module()
class BenchmarkHook(Hook):
"""A hook that logs the training speed of each epch."""
priority = 'NORMAL'
def after_train_epoch(self, runner) -> None:
"""We use the average throughput in iterations of the entire training
run and skip the first 50 iterations of each epoch to skip GPU warmup
time.
Args:
runner (Runner): The runner of the training process.
"""
message_hub = runner.message_hub
max_iter_num = len(runner.train_dataloader)
speed = message_hub.get_scalar('train/time').mean(max_iter_num - 50)
message_hub.update_scalar('train/speed', speed)
runner.logger.info(
f'Training speed of epoch {runner.epoch + 1} is {speed} s/iter')
def after_train(self, runner) -> None:
"""Log average training speed of entire training process.
Args:
runner (Runner): The runner of the training process.
"""
message_hub = runner.message_hub
avg_speed = message_hub.get_scalar('train/speed').mean()
runner.logger.info('Average training speed of entire training process '
f'is {avg_speed} s/iter')
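As a usage sketch (not part of this diff): since BenchmarkHook registers itself in HOOKS, it can be enabled through the custom_hooks field of an MMEngine config.
# Hypothetical config snippet; the rest of the config (model, dataset,
# schedule) is omitted.
custom_hooks = [dict(type='BenchmarkHook')]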
......@@ -4,6 +4,7 @@ import warnings
from typing import Optional, Sequence
import mmcv
import numpy as np
from mmengine.fileio import FileClient
from mmengine.hooks import Hook
from mmengine.runner import Runner
......@@ -95,15 +96,27 @@ class Det3DVisualizationHook(Hook):
# is visualized for each evaluation.
total_curr_iter = runner.iter + batch_idx
data_input = dict()
# Only visualize the first sample in the batch
img_path = outputs[0].img_path
img_bytes = self.file_client.get(img_path)
img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
if 'img_path' in outputs[0]:
img_path = outputs[0].img_path
img_bytes = self.file_client.get(img_path)
img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
data_input['img'] = img
if 'lidar_path' in outputs[0]:
lidar_path = outputs[0].lidar_path
num_pts_feats = outputs[0].num_pts_feats
pts_bytes = self.file_client.get(lidar_path)
points = np.frombuffer(pts_bytes, dtype=np.float32)
points = points.reshape(-1, num_pts_feats)
data_input['points'] = points
if total_curr_iter % self.interval == 0:
self._visualizer.add_datasample(
osp.basename(img_path) if self.show else 'val_img',
img,
'val sample',
data_input,
data_sample=outputs[0],
show=self.show,
wait_time=self.wait_time,
......@@ -135,9 +148,20 @@ class Det3DVisualizationHook(Hook):
for data_sample in outputs:
self._test_index += 1
img_path = data_sample.img_path
img_bytes = self.file_client.get(img_path)
img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
data_input = dict()
if 'img_path' in data_sample:
img_path = data_sample.img_path
img_bytes = self.file_client.get(img_path)
img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
data_input['img'] = img
if 'lidar_path' in data_sample:
lidar_path = data_sample.lidar_path
num_pts_feats = data_sample.num_pts_feats
pts_bytes = self.file_client.get(lidar_path)
points = np.frombuffer(pts_bytes, dtype=np.float32)
points = points.reshape(-1, num_pts_feats)
data_input['points'] = points
out_file = None
if self.test_out_dir is not None:
......@@ -145,8 +169,8 @@ class Det3DVisualizationHook(Hook):
out_file = osp.join(self.test_out_dir, out_file)
self._visualizer.add_datasample(
osp.basename(img_path) if self.show else 'test_img',
img,
'test sample',
data_input,
data_sample=data_sample,
show=self.show,
wait_time=self.wait_time,
......
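For reference, a self-contained sketch of the point-decoding pattern used in the visualization hook above; the feature count and byte buffer are illustrative stand-ins for real files.
import numpy as np

# Assume each point is stored as 4 float32 values: x, y, z, intensity.
num_pts_feats = 4
pts_bytes = np.arange(8, dtype=np.float32).tobytes()  # stand-in for file_client.get(lidar_path)
points = np.frombuffer(pts_bytes, dtype=np.float32).reshape(-1, num_pts_feats)
print(points.shape)  # (2, 4)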
......@@ -66,7 +66,8 @@ class KittiMetric(BaseMetric):
self.default_cam_key = default_cam_key
self.file_client_args = file_client_args
self.default_cam_key = default_cam_key
allowed_metrics = ['bbox', 'img_bbox', 'mAP']
allowed_metrics = ['bbox', 'img_bbox', 'mAP', 'LET_mAP']
self.metrics = metric if isinstance(metric, list) else [metric]
for metric in self.metrics:
if metric not in allowed_metrics:
......@@ -168,7 +169,7 @@ class KittiMetric(BaseMetric):
"""Compute the metrics from processed results.
Args:
results (list): The processed results of each batch.
results (list): The processed results of the whole dataset.
Returns:
Dict[str, float]: The computed metrics. The keys are the names of
......@@ -575,7 +576,7 @@ class KittiMetric(BaseMetric):
box_preds = box_dict['bboxes_3d']
scores = box_dict['scores_3d']
labels = box_dict['labels_3d']
sample_idx = info['sample_id']
sample_idx = info['sample_idx']
box_preds.limit_yaw(offset=0.5, period=np.pi * 2)
if len(box_preds) == 0:
......
......@@ -11,8 +11,9 @@ from mmengine.logging import MMLogger, print_log
from mmdet3d.models.layers import box3d_multiclass_nms
from mmdet3d.registry import METRICS
from mmdet3d.structures import (Box3DMode, LiDARInstance3DBoxes, bbox3d2result,
xywhr2xyxyr)
from mmdet3d.structures import (Box3DMode, CameraInstance3DBoxes,
LiDARInstance3DBoxes, bbox3d2result,
points_cam2img, xywhr2xyxyr)
from .kitti_metric import KittiMetric
......@@ -27,7 +28,7 @@ class WaymoMetric(KittiMetric):
Used for storing waymo evaluation programs.
split (str): The split of the evaluation set.
metric (str | list[str]): Metrics to be evaluated.
Default to 'bbox'.
Default to 'mAP'.
pcd_limit_range (list): The range of point cloud used to
filter invalid predicted boxes.
Default to [0, -40, -3, 70.4, 40, 0.0].
......@@ -54,13 +55,14 @@ class WaymoMetric(KittiMetric):
'gpu'. Defaults to 'cpu'.
file_client_args (dict): file client for reading gt in waymo format.
"""
num_cams = 5
def __init__(self,
ann_file: str,
waymo_bin_file: str,
data_root: str,
split: str = 'training',
metric: Union[str, List[str]] = 'bbox',
metric: Union[str, List[str]] = 'mAP',
pcd_limit_range: List[float] = [-85, -85, -5, 85, 85, 5],
prefix: Optional[str] = None,
pklfile_prefix: str = None,
......@@ -70,7 +72,6 @@ class WaymoMetric(KittiMetric):
use_pred_sample_idx: bool = False,
collect_device: str = 'cpu',
file_client_args: dict = dict(backend='disk')):
self.waymo_bin_file = waymo_bin_file
self.data_root = data_root
self.split = split
......@@ -92,7 +93,7 @@ class WaymoMetric(KittiMetric):
"""Compute the metrics from processed results.
Args:
results (list): The processed results of each batch.
results (list): The processed results of the whole dataset.
Returns:
Dict[str, float]: The computed metrics. The keys are the names of
......@@ -104,6 +105,35 @@ class WaymoMetric(KittiMetric):
# load annotations
self.data_infos = load(self.ann_file)['data_list']
# different from KITTI, Waymo does not need to convert the ann file
# handle the mono3d task
if self.task == 'mono3d':
new_data_infos = []
for info in self.data_infos:
height = info['images'][self.default_cam_key]['height']
width = info['images'][self.default_cam_key]['width']
for (cam_key, img_info) in info['images'].items():
camera_info = dict()
camera_info['images'] = dict()
camera_info['images'][cam_key] = img_info
# TODO remove the check by updating the data info;
if 'height' not in img_info:
img_info['height'] = height
img_info['width'] = width
if 'cam_instances' in info \
and cam_key in info['cam_instances']:
camera_info['instances'] = info['cam_instances'][
cam_key]
else:
camera_info['instances'] = []
camera_info['ego2global'] = info['ego2global']
if 'image_sweeps' in info:
camera_info['image_sweeps'] = info['image_sweeps']
# TODO: check if the sample id needs to be modified
# TODO: check when it will be used other than in evaluation.
camera_info['sample_id'] = info['sample_id']
new_data_infos.append(camera_info)
self.data_infos = new_data_infos
if self.pklfile_prefix is None:
eval_tmp_dir = tempfile.TemporaryDirectory()
......@@ -120,65 +150,141 @@ class WaymoMetric(KittiMetric):
submission_prefix=self.submission_prefix,
classes=self.classes)
import subprocess
eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \
f'compute_detection_metrics_main {pklfile_prefix}.bin ' + \
f'{self.waymo_bin_file}'
print(eval_str)
ret_bytes = subprocess.check_output(eval_str, shell=True)
ret_texts = ret_bytes.decode('utf-8')
print_log(ret_texts, logger=logger)
ap_dict = {
'Vehicle/L1 mAP': 0,
'Vehicle/L1 mAPH': 0,
'Vehicle/L2 mAP': 0,
'Vehicle/L2 mAPH': 0,
'Pedestrian/L1 mAP': 0,
'Pedestrian/L1 mAPH': 0,
'Pedestrian/L2 mAP': 0,
'Pedestrian/L2 mAPH': 0,
'Sign/L1 mAP': 0,
'Sign/L1 mAPH': 0,
'Sign/L2 mAP': 0,
'Sign/L2 mAPH': 0,
'Cyclist/L1 mAP': 0,
'Cyclist/L1 mAPH': 0,
'Cyclist/L2 mAP': 0,
'Cyclist/L2 mAPH': 0,
'Overall/L1 mAP': 0,
'Overall/L1 mAPH': 0,
'Overall/L2 mAP': 0,
'Overall/L2 mAPH': 0
}
mAP_splits = ret_texts.split('mAP ')
mAPH_splits = ret_texts.split('mAPH ')
for idx, key in enumerate(ap_dict.keys()):
split_idx = int(idx / 2) + 1
if idx % 2 == 0: # mAP
ap_dict[key] = float(mAP_splits[split_idx].split(']')[0])
else: # mAPH
ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0])
ap_dict['Overall/L1 mAP'] = \
(ap_dict['Vehicle/L1 mAP'] + ap_dict['Pedestrian/L1 mAP'] +
ap_dict['Cyclist/L1 mAP']) / 3
ap_dict['Overall/L1 mAPH'] = \
(ap_dict['Vehicle/L1 mAPH'] + ap_dict['Pedestrian/L1 mAPH'] +
ap_dict['Cyclist/L1 mAPH']) / 3
ap_dict['Overall/L2 mAP'] = \
(ap_dict['Vehicle/L2 mAP'] + ap_dict['Pedestrian/L2 mAP'] +
ap_dict['Cyclist/L2 mAP']) / 3
ap_dict['Overall/L2 mAPH'] = \
(ap_dict['Vehicle/L2 mAPH'] + ap_dict['Pedestrian/L2 mAPH'] +
ap_dict['Cyclist/L2 mAPH']) / 3
metric_dict = {}
for metric in self.metrics:
ap_dict = self.waymo_evaluate(
pklfile_prefix, metric=metric, logger=logger)
metric_dict[metric] = ap_dict
if eval_tmp_dir is not None:
eval_tmp_dir.cleanup()
if tmp_dir is not None:
tmp_dir.cleanup()
return metric_dict
def waymo_evaluate(self,
pklfile_prefix: str,
metric: str = None,
logger: MMLogger = None) -> dict:
"""Evaluation in Waymo protocol.
Args:
pklfile_prefix (str): The location that stored the prediction
results.
metric (str): Metric to be evaluated. Defaults to None.
logger (MMLogger, optional): Logger used for printing
related information during evaluation. Default: None.
Returns:
dict[str, float]: Results of each evaluation metric.
"""
import subprocess
if metric == 'mAP':
eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \
f'compute_detection_metrics_main {pklfile_prefix}.bin ' + \
f'{self.waymo_bin_file}'
print(eval_str)
ret_bytes = subprocess.check_output(eval_str, shell=True)
ret_texts = ret_bytes.decode('utf-8')
print_log(ret_texts, logger=logger)
ap_dict = {
'Vehicle/L1 mAP': 0,
'Vehicle/L1 mAPH': 0,
'Vehicle/L2 mAP': 0,
'Vehicle/L2 mAPH': 0,
'Pedestrian/L1 mAP': 0,
'Pedestrian/L1 mAPH': 0,
'Pedestrian/L2 mAP': 0,
'Pedestrian/L2 mAPH': 0,
'Sign/L1 mAP': 0,
'Sign/L1 mAPH': 0,
'Sign/L2 mAP': 0,
'Sign/L2 mAPH': 0,
'Cyclist/L1 mAP': 0,
'Cyclist/L1 mAPH': 0,
'Cyclist/L2 mAP': 0,
'Cyclist/L2 mAPH': 0,
'Overall/L1 mAP': 0,
'Overall/L1 mAPH': 0,
'Overall/L2 mAP': 0,
'Overall/L2 mAPH': 0
}
mAP_splits = ret_texts.split('mAP ')
mAPH_splits = ret_texts.split('mAPH ')
for idx, key in enumerate(ap_dict.keys()):
split_idx = int(idx / 2) + 1
if idx % 2 == 0: # mAP
ap_dict[key] = float(mAP_splits[split_idx].split(']')[0])
else: # mAPH
ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0])
ap_dict['Overall/L1 mAP'] = \
(ap_dict['Vehicle/L1 mAP'] + ap_dict['Pedestrian/L1 mAP'] +
ap_dict['Cyclist/L1 mAP']) / 3
ap_dict['Overall/L1 mAPH'] = \
(ap_dict['Vehicle/L1 mAPH'] + ap_dict['Pedestrian/L1 mAPH'] +
ap_dict['Cyclist/L1 mAPH']) / 3
ap_dict['Overall/L2 mAP'] = \
(ap_dict['Vehicle/L2 mAP'] + ap_dict['Pedestrian/L2 mAP'] +
ap_dict['Cyclist/L2 mAP']) / 3
ap_dict['Overall/L2 mAPH'] = \
(ap_dict['Vehicle/L2 mAPH'] + ap_dict['Pedestrian/L2 mAPH'] +
ap_dict['Cyclist/L2 mAPH']) / 3
elif metric == 'LET_mAP':
eval_str = 'mmdet3d/evaluation/functional/waymo_utils/' + \
f'compute_detection_let_metrics_main {pklfile_prefix}.bin ' + \
f'{self.waymo_bin_file}'
print(eval_str)
ret_bytes = subprocess.check_output(eval_str, shell=True)
ret_texts = ret_bytes.decode('utf-8')
print_log(ret_texts, logger=logger)
ap_dict = {
'Vehicle mAPL': 0,
'Vehicle mAP': 0,
'Vehicle mAPH': 0,
'Pedestrian mAPL': 0,
'Pedestrian mAP': 0,
'Pedestrian mAPH': 0,
'Sign mAPL': 0,
'Sign mAP': 0,
'Sign mAPH': 0,
'Cyclist mAPL': 0,
'Cyclist mAP': 0,
'Cyclist mAPH': 0,
'Overall mAPL': 0,
'Overall mAP': 0,
'Overall mAPH': 0
}
mAPL_splits = ret_texts.split('mAPL ')
mAP_splits = ret_texts.split('mAP ')
mAPH_splits = ret_texts.split('mAPH ')
for idx, key in enumerate(ap_dict.keys()):
split_idx = int(idx / 3) + 1
if idx % 3 == 0: # mAPL
ap_dict[key] = float(mAPL_splits[split_idx].split(']')[0])
elif idx % 3 == 1: # mAP
ap_dict[key] = float(mAP_splits[split_idx].split(']')[0])
else: # mAPH
ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0])
ap_dict['Overall mAPL'] = \
(ap_dict['Vehicle mAPL'] + ap_dict['Pedestrian mAPL'] +
ap_dict['Cyclist mAPL']) / 3
ap_dict['Overall mAP'] = \
(ap_dict['Vehicle mAP'] + ap_dict['Pedestrian mAP'] +
ap_dict['Cyclist mAP']) / 3
ap_dict['Overall mAPH'] = \
(ap_dict['Vehicle mAPH'] + ap_dict['Pedestrian mAPH'] +
ap_dict['Cyclist mAPH']) / 3
return ap_dict
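To illustrate the parsing above, a toy example; the bracketed metrics text is an assumed format inferred from the split logic, not real output of the Waymo evaluation binary.
# Hypothetical fragment of ret_texts.
ret_texts = '[Vehicle L1 mAP 0.75] [Vehicle L1 mAPH 0.70]'
mAP_splits = ret_texts.split('mAP ')
mAPH_splits = ret_texts.split('mAPH ')
print(float(mAP_splits[1].split(']')[0]))   # 0.75
print(float(mAPH_splits[1].split(']')[0]))  # 0.7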
def format_results(self,
......@@ -254,7 +360,7 @@ class WaymoMetric(KittiMetric):
for cam_idx in range(self.num_cams):
box_dict[key].append(box_dict_per_frame[cam_idx][key])
# merge each elements
box_dict['sample_id'] = cam0_info['image_id']
box_dict['sample_idx'] = cam0_info['image_id']
for key in ['bbox', 'box3d_lidar', 'scores', 'label_preds']:
box_dict[key] = np.concatenate(box_dict[key])
......@@ -284,14 +390,14 @@ class WaymoMetric(KittiMetric):
nms_cfg.max_per_frame, nms_cfg)
lidar_boxes3d = LiDARInstance3DBoxes(boxes3d)
det = bbox3d2result(lidar_boxes3d, scores, labels)
box_preds_lidar = det['boxes_3d']
box_preds_lidar = det['bboxes_3d']
scores = det['scores_3d']
labels = det['labels_3d']
# box_preds_camera is in the cam0 system
rect = cam0_info['calib']['R0_rect'].astype(np.float32)
Trv2c = cam0_info['calib']['Tr_velo_to_cam'].astype(np.float32)
lidar2cam = cam0_info['images'][self.default_cam_key]['lidar2img']
lidar2cam = np.array(lidar2cam).astype(np.float32)
box_preds_camera = box_preds_lidar.convert_to(
Box3DMode.CAM, rect @ Trv2c, correct_yaw=True)
Box3DMode.CAM, np.linalg.inv(lidar2cam), correct_yaw=True)
# Note: bbox is meaningless in final evaluation, set to 0
merged_box_dict = dict(
bbox=np.zeros([box_preds_lidar.tensor.shape[0], 4]),
......@@ -299,7 +405,7 @@ class WaymoMetric(KittiMetric):
box3d_lidar=box_preds_lidar.tensor.numpy(),
scores=scores.numpy(),
label_preds=labels.numpy(),
sample_idx=box_dict['sample_idx'],
sample_idx=box_dict['sample_id'],
)
return merged_box_dict
......@@ -337,23 +443,31 @@ class WaymoMetric(KittiMetric):
annos = []
sample_idx = sample_id_list[idx]
info = self.data_infos[sample_idx]
# By default, 'CAM2' is used to compute the metric. If you want to
# use another camera, please modify it.
image_shape = (info['images'][self.default_cam_key]['height'],
info['images'][self.default_cam_key]['width'])
if self.task == 'mono3d':
if self.task == 'mono_det':
if idx % self.num_cams == 0:
box_dict_per_frame = []
cam0_idx = idx
box_dict = self.convert_valid_bboxes(pred_dicts, info)
cam0_key = list(info['images'].keys())[0]
cam0_info = info
# In mono3d, we use 'CAM_FRONT' (the first camera index) to get
# the default image shape. If you want to use another camera,
# please modify it.
image_shape = (info['images'][cam0_key]['height'],
info['images'][cam0_key]['width'])
box_dict = self.convert_valid_bboxes(pred_dicts, info)
else:
box_dict = self.convert_valid_bboxes(pred_dicts, info)
# By default, 'CAM_FRONT' is used to compute the metric.
# If you want to use another camera, please modify it.
image_shape = (info['images'][self.default_cam_key]['height'],
info['images'][self.default_cam_key]['width'])
if self.task == 'mono3d':
box_dict_per_frame.append(box_dict)
if (idx + 1) % self.num_cams != 0:
continue
box_dict = self.merge_multi_view_boxes(
box_dict_per_frame, self.data_infos[cam0_idx])
box_dict = self.merge_multi_view_boxes(box_dict_per_frame,
cam0_info)
anno = {
'name': [],
'truncated': [],
......@@ -444,3 +558,106 @@ class WaymoMetric(KittiMetric):
print(f'Result is saved to {out}.')
return det_annos
def convert_valid_bboxes(self, box_dict: dict, info: dict):
"""Convert the predicted boxes into valid ones. Should handle the
different task mode (mono3d, mv3d, lidar), separately.
Args:
box_dict (dict): Box dictionaries to be converted.
- boxes_3d (:obj:`LiDARInstance3DBoxes`): 3D bounding boxes.
- scores_3d (torch.Tensor): Scores of boxes.
- labels_3d (torch.Tensor): Class labels of boxes.
info (dict): Data info.
Returns:
dict: Valid predicted boxes.
- bbox (np.ndarray): 2D bounding boxes.
- box3d_camera (np.ndarray): 3D bounding boxes in
camera coordinate.
- box3d_lidar (np.ndarray): 3D bounding boxes in
LiDAR coordinate.
- scores (np.ndarray): Scores of boxes.
- label_preds (np.ndarray): Class label predictions.
- sample_idx (int): Sample index.
"""
# TODO: refactor this function
box_preds = box_dict['bboxes_3d']
scores = box_dict['scores_3d']
labels = box_dict['labels_3d']
sample_idx = info['sample_id']
box_preds.limit_yaw(offset=0.5, period=np.pi * 2)
if len(box_preds) == 0:
return dict(
bbox=np.zeros([0, 4]),
box3d_camera=np.zeros([0, 7]),
box3d_lidar=np.zeros([0, 7]),
scores=np.zeros([0]),
label_preds=np.zeros([0, 4]),
sample_idx=sample_idx)
# By default, 'CAM2' is used to compute the metric. If you want to
# use another camera, please modify it.
if self.task in ['mv3d', 'lidar']:
cam_key = self.default_cam_key
elif self.task == 'mono3d':
cam_key = list(info['images'].keys())[0]
else:
raise NotImplementedError
lidar2cam = np.array(info['images'][cam_key]['lidar2cam']).astype(
np.float32)
P2 = np.array(info['images'][cam_key]['cam2img']).astype(np.float32)
img_shape = (info['images'][cam_key]['height'],
info['images'][cam_key]['width'])
P2 = box_preds.tensor.new_tensor(P2)
if isinstance(box_preds, LiDARInstance3DBoxes):
box_preds_camera = box_preds.convert_to(Box3DMode.CAM, lidar2cam)
box_preds_lidar = box_preds
elif isinstance(box_preds, CameraInstance3DBoxes):
box_preds_camera = box_preds
box_preds_lidar = box_preds.convert_to(Box3DMode.LIDAR,
np.linalg.inv(lidar2cam))
box_corners = box_preds_camera.corners
box_corners_in_image = points_cam2img(box_corners, P2)
# box_corners_in_image: [N, 8, 2]
minxy = torch.min(box_corners_in_image, dim=1)[0]
maxxy = torch.max(box_corners_in_image, dim=1)[0]
box_2d_preds = torch.cat([minxy, maxxy], dim=1)
# Post-processing
# check box_preds_camera
image_shape = box_preds.tensor.new_tensor(img_shape)
valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) &
(box_2d_preds[:, 1] < image_shape[0]) &
(box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0))
# check box_preds_lidar
if self.task in ['lidar', 'mv3d']:
limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range)
valid_pcd_inds = ((box_preds_lidar.center > limit_range[:3]) &
(box_preds_lidar.center < limit_range[3:]))
valid_inds = valid_pcd_inds.all(-1)
elif self.task == 'mono3d':
valid_inds = valid_cam_inds
if valid_inds.sum() > 0:
return dict(
bbox=box_2d_preds[valid_inds, :].numpy(),
pred_box_type_3d=type(box_preds),
box3d_camera=box_preds_camera[valid_inds].tensor.numpy(),
box3d_lidar=box_preds_lidar[valid_inds].tensor.numpy(),
scores=scores[valid_inds].numpy(),
label_preds=labels[valid_inds].numpy(),
sample_idx=sample_idx)
else:
return dict(
bbox=np.zeros([0, 4]),
pred_box_type_3d=type(box_preds),
box3d_camera=np.zeros([0, 7]),
box3d_lidar=np.zeros([0, 7]),
scores=np.zeros([0]),
label_preds=np.zeros([0]),
sample_idx=sample_idx)
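The corner-projection step in convert_valid_bboxes reduces the 8 projected corners of each box to an axis-aligned 2D box; a standalone sketch of that reduction with random stand-in data:
import torch

box_corners_in_image = torch.rand(5, 8, 2) * 100   # [N, 8, 2] projected corners
minxy = torch.min(box_corners_in_image, dim=1)[0]  # [N, 2]
maxxy = torch.max(box_corners_in_image, dim=1)[0]  # [N, 2]
box_2d_preds = torch.cat([minxy, maxxy], dim=1)    # [N, 4] as (x1, y1, x2, y2)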
......@@ -92,7 +92,7 @@ def build_segmentor(cfg, train_cfg=None, test_cfg=None):
def build_model(cfg, train_cfg=None, test_cfg=None):
"""A function warpper for building 3D detector or segmentor according to
"""A function wrapper for building 3D detector or segmentor according to
cfg.
Should be deprecated in the future.
......
......@@ -13,7 +13,7 @@ from torch.nn import functional as F
from mmdet3d.registry import MODELS
from mmdet3d.utils import OptConfigType
from mmdet.models import DetDataPreprocessor
from mmdet.models.utils.misc import samplelist_boxlist2tensor
from .utils import multiview_img_stack_batch
@MODELS.register_module()
......@@ -75,7 +75,7 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
seg_pad_value: int = 255,
bgr_to_rgb: bool = False,
rgb_to_bgr: bool = False,
boxlist2tensor: bool = True,
boxtype2tensor: bool = True,
batch_augments: Optional[List[dict]] = None):
super().__init__(
mean=mean,
......@@ -88,7 +88,6 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
seg_pad_value=seg_pad_value,
bgr_to_rgb=bgr_to_rgb,
rgb_to_bgr=rgb_to_bgr,
boxlist2tensor=boxlist2tensor,
batch_augments=batch_augments)
self.voxel = voxel
self.voxel_type = voxel_type
......@@ -104,10 +103,10 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
``BaseDataPreprocessor``.
Args:
data (List[dict] | List[List[dict]]): data from dataloader.
The outer list always represent the batch size, when it is
a list[list[dict]], the inter list indicate test time
augmentation.
data (dict | List[dict]): data from dataloader.
The dict contains the whole batch data, when it is
a list[dict], the list indicate test time augmentation.
training (bool): Whether to enable training time augmentation.
Defaults to False.
......@@ -144,7 +143,6 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
data = self.collate_data(data)
inputs, data_samples = data['inputs'], data['data_samples']
batch_inputs = dict()
if 'points' in inputs:
......@@ -169,9 +167,14 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
'pad_shape': pad_shape
})
if self.boxlist2tensor:
if hasattr(self, 'boxtype2tensor') and self.boxtype2tensor:
from mmdet.models.utils.misc import \
samplelist_boxtype2tensor
samplelist_boxtype2tensor(data_samples)
elif hasattr(self, 'boxlist2tensor') and self.boxlist2tensor:
from mmdet.models.utils.misc import \
samplelist_boxlist2tensor
samplelist_boxlist2tensor(data_samples)
if self.pad_mask:
self.pad_gt_masks(data_samples)
......@@ -185,6 +188,23 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
return {'inputs': batch_inputs, 'data_samples': data_samples}
def preprocess_img(self, _batch_img):
# channel transform
if self._channel_conversion:
_batch_img = _batch_img[[2, 1, 0], ...]
# Convert to float after channel conversion to ensure
# efficiency
_batch_img = _batch_img.float()
# Normalization.
if self._enable_normalize:
if self.mean.shape[0] == 3:
assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3, (
'If the mean has 3 values, the input tensor '
'should in shape of (3, H, W), but got the '
f'tensor with shape {_batch_img.shape}')
_batch_img = (_batch_img - self.mean) / self.std
return _batch_img
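The per-image path above amounts to the following standalone steps; the mean/std values are illustrative assumptions.
import torch

img = torch.randint(0, 256, (3, 4, 4), dtype=torch.uint8)
img = img[[2, 1, 0], ...].float()  # BGR -> RGB, then convert to float
mean = torch.tensor([123.675, 116.28, 103.53]).view(3, 1, 1)
std = torch.tensor([58.395, 57.12, 57.375]).view(3, 1, 1)
img = (img - mean) / std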
def collate_data(self, data: dict) -> dict:
"""Copying data to the target device and Performs normalization、
padding and bgr2rgb conversion and stack based on
......@@ -203,30 +223,30 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
if 'img' in data['inputs']:
_batch_imgs = data['inputs']['img']
# Process data with `pseudo_collate`.
if is_list_of(_batch_imgs, torch.Tensor):
batch_imgs = []
img_dim = _batch_imgs[0].dim()
for _batch_img in _batch_imgs:
# channel transform
if self._channel_conversion:
_batch_img = _batch_img[[2, 1, 0], ...]
# Convert to float after channel conversion to ensure
# efficiency
_batch_img = _batch_img.float()
# Normalization.
if self._enable_normalize:
if self.mean.shape[0] == 3:
assert _batch_img.dim(
) == 3 and _batch_img.shape[0] == 3, (
'If the mean has 3 values, the input tensor '
'should in shape of (3, H, W), but got the '
f'tensor with shape {_batch_img.shape}')
_batch_img = (_batch_img - self.mean) / self.std
if img_dim == 3: # standard img
_batch_img = self.preprocess_img(_batch_img)
elif img_dim == 4:
_batch_img = [
self.preprocess_img(_img) for _img in _batch_img
]
_batch_img = torch.stack(_batch_img, dim=0)
batch_imgs.append(_batch_img)
# Pad and stack Tensor.
batch_imgs = stack_batch(batch_imgs, self.pad_size_divisor,
self.pad_value)
if img_dim == 3:
batch_imgs = stack_batch(batch_imgs, self.pad_size_divisor,
self.pad_value)
elif img_dim == 4:
batch_imgs = multiview_img_stack_batch(
batch_imgs, self.pad_size_divisor, self.pad_value)
# Process data with `default_collate`.
elif isinstance(_batch_imgs, torch.Tensor):
assert _batch_imgs.dim() == 4, (
......@@ -270,6 +290,10 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
if is_list_of(_batch_inputs, torch.Tensor):
batch_pad_shape = []
for ori_input in _batch_inputs:
if ori_input.dim() == 4:
# means multiview input, select one of the
# images to calculate the pad shape
ori_input = ori_input[0]
pad_h = int(
np.ceil(ori_input.shape[1] /
self.pad_size_divisor)) * self.pad_size_divisor
......@@ -293,7 +317,7 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
else:
raise TypeError('Output of `cast_data` should be a list of dict '
'or a tuple with inputs and data_samples, but got'
f'{type(data)} {data}')
f'{type(data)}: {data}')
return batch_pad_shape
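The pad-shape computation above rounds H and W up to the nearest multiple of the divisor; e.g. with a KITTI-like image size (values chosen for illustration):
import numpy as np

pad_size_divisor = 32
h, w = 375, 1242
pad_h = int(np.ceil(h / pad_size_divisor)) * pad_size_divisor  # 384
pad_w = int(np.ceil(w / pad_size_divisor)) * pad_size_divisor  # 1248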
@torch.no_grad()
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Union
import torch
import torch.nn.functional as F
def multiview_img_stack_batch(
tensor_list: List[torch.Tensor],
pad_size_divisor: int = 1,
pad_value: Union[int, float] = 0) -> torch.Tensor:
"""
Compared to the stack_batch in mmengine.model.utils,
multiview_img_stack_batch further handles multiview images.
See the diff of padded_sizes[:, :-2] = 0 vs padded_sizes[:, 0] = 0 below.
Stack multiple tensors to form a batch and pad the tensors to the max
shape using the right-bottom padding mode. If
``pad_size_divisor > 0``, add padding to ensure the shape of each dim is
divisible by ``pad_size_divisor``.
Args:
tensor_list (List[Tensor]): A list of tensors with the same dim.
pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding
to ensure the shape of each dim is divisible by
``pad_size_divisor``. This depends on the model, and many
models need to be divisible by 32. Defaults to 1.
pad_value (int, float): The padding value. Defaults to 0.
Returns:
Tensor: The n dim tensor.
"""
assert isinstance(
tensor_list,
list), (f'Expected input type to be list, but got {type(tensor_list)}')
assert tensor_list, '`tensor_list` should not be an empty list'
assert len({
tensor.ndim
for tensor in tensor_list
}) == 1, (f'Expected the dimensions of all tensors to be the same, '
f'but got {[tensor.ndim for tensor in tensor_list]}')
dim = tensor_list[0].dim()
num_img = len(tensor_list)
all_sizes: torch.Tensor = torch.Tensor(
[tensor.shape for tensor in tensor_list])
max_sizes = torch.ceil(
torch.max(all_sizes, dim=0)[0] / pad_size_divisor) * pad_size_divisor
padded_sizes = max_sizes - all_sizes
# Dims other than the last two (e.g. channel and num_views) normally
# should not be padded.
padded_sizes[:, :-2] = 0
if padded_sizes.sum() == 0:
return torch.stack(tensor_list)
# `pad` is the second arguments of `F.pad`. If pad is (1, 2, 3, 4),
# it means that padding the last dim with 1(left) 2(right), padding the
# penultimate dim to 3(top) 4(bottom). The order of `pad` is opposite of
# the `padded_sizes`. Therefore, the `padded_sizes` needs to be reversed,
# and only odd index of pad should be assigned to keep padding "right" and
# "bottom".
pad = torch.zeros(num_img, 2 * dim, dtype=torch.int)
pad[:, 1::2] = padded_sizes[:, range(dim - 1, -1, -1)]
batch_tensor = []
for idx, tensor in enumerate(tensor_list):
batch_tensor.append(
F.pad(tensor, tuple(pad[idx].tolist()), value=pad_value))
return torch.stack(batch_tensor)
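A quick usage sketch of multiview_img_stack_batch; the tensor shapes are arbitrary assumptions.
import torch

# Two multiview image tensors (num_views, C, H, W) with different H and W.
a = torch.zeros(2, 3, 30, 40)
b = torch.zeros(2, 3, 32, 48)
batch = multiview_img_stack_batch([a, b], pad_size_divisor=32)
print(batch.shape)  # torch.Size([2, 2, 3, 32, 64])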
......@@ -204,7 +204,7 @@ class Base3DDenseHead(BaseModule, metaclass=ABCMeta):
score_factors (list[Tensor], optional): Score factor for
all scale level, each is a 4D-tensor, has shape
(batch_size, num_priors * 1, H, W). Defaults to None.
batch_input_metas (list[dict], Optional): Batch image meta info.
batch_input_metas (list[dict], Optional): Batch inputs meta info.
Defaults to None.
cfg (ConfigDict, optional): Test / postprocessing
configuration, if None, test_cfg would be used.
......
......@@ -183,8 +183,7 @@ class PartA2RPNHead(Anchor3DHead):
result = self.class_agnostic_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
mlvl_max_scores, mlvl_label_pred,
mlvl_cls_score, mlvl_dir_scores,
score_thr, cfg.nms_post, cfg,
input_meta)
score_thr, cfg, input_meta)
return result
def loss_and_predict(self,
......@@ -275,7 +274,7 @@ class PartA2RPNHead(Anchor3DHead):
mlvl_bboxes_for_nms: Tensor,
mlvl_max_scores: Tensor, mlvl_label_pred: Tensor,
mlvl_cls_score: Tensor, mlvl_dir_scores: Tensor,
score_thr: int, max_num: int, cfg: ConfigDict,
score_thr: int, cfg: ConfigDict,
input_meta: dict) -> Dict:
"""Class agnostic nms for single batch.
......@@ -291,7 +290,6 @@ class PartA2RPNHead(Anchor3DHead):
mlvl_dir_scores (torch.Tensor): Direction scores of
Multi-level bbox.
score_thr (int): Score threshold.
max_num (int): Max number of bboxes after nms.
cfg (:obj:`ConfigDict`): Training or testing config.
input_meta (dict): Contain pcd and img's meta info.
......@@ -339,9 +337,9 @@ class PartA2RPNHead(Anchor3DHead):
scores = torch.cat(scores, dim=0)
cls_scores = torch.cat(cls_scores, dim=0)
labels = torch.cat(labels, dim=0)
if bboxes.shape[0] > max_num:
if bboxes.shape[0] > cfg.nms_post:
_, inds = scores.sort(descending=True)
inds = inds[:max_num]
inds = inds[:cfg.nms_post]
bboxes = bboxes[inds, :]
labels = labels[inds]
scores = scores[inds]
......
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Optional, Tuple
import torch
from mmengine.model import BaseModule
from mmengine.structures import InstanceData
from torch import Tensor
from torch import nn as nn
from mmdet3d.models.builder import build_loss
from mmdet3d.models.layers import nms_bev, nms_normal_bev
from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures import xywhr2xyxyr
from mmdet3d.structures.bbox_3d import (DepthInstance3DBoxes,
from mmdet3d.structures.bbox_3d import (BaseInstance3DBoxes,
DepthInstance3DBoxes,
LiDARInstance3DBoxes)
from mmdet3d.structures.det3d_data_sample import SampleList
from mmdet3d.utils.typing import InstanceList
from mmdet.models.utils import multi_apply
......@@ -34,15 +40,15 @@ class PointRPNHead(BaseModule):
"""
def __init__(self,
num_classes,
train_cfg,
test_cfg,
pred_layer_cfg=None,
enlarge_width=0.1,
cls_loss=None,
bbox_loss=None,
bbox_coder=None,
init_cfg=None):
num_classes: int,
train_cfg: dict,
test_cfg: dict,
pred_layer_cfg: Optional[dict] = None,
enlarge_width: float = 0.1,
cls_loss: Optional[dict] = None,
bbox_loss: Optional[dict] = None,
bbox_coder: Optional[dict] = None,
init_cfg: Optional[dict] = None) -> None:
super().__init__(init_cfg=init_cfg)
self.num_classes = num_classes
self.train_cfg = train_cfg
......@@ -50,8 +56,8 @@ class PointRPNHead(BaseModule):
self.enlarge_width = enlarge_width
# build loss function
self.bbox_loss = build_loss(bbox_loss)
self.cls_loss = build_loss(cls_loss)
self.bbox_loss = MODELS.build(bbox_loss)
self.cls_loss = MODELS.build(cls_loss)
# build box coder
self.bbox_coder = TASK_UTILS.build(bbox_coder)
......@@ -67,7 +73,8 @@ class PointRPNHead(BaseModule):
input_channels=pred_layer_cfg.in_channels,
output_channels=self._get_reg_out_channels())
def _make_fc_layers(self, fc_cfg, input_channels, output_channels):
def _make_fc_layers(self, fc_cfg: dict, input_channels: int,
output_channels: int) -> nn.Sequential:
"""Make fully connect layers.
Args:
......@@ -102,7 +109,7 @@ class PointRPNHead(BaseModule):
# torch.cos(yaw) (1), torch.sin(yaw) (1)
return self.bbox_coder.code_size
def forward(self, feat_dict):
def forward(self, feat_dict: dict) -> Tuple[List[Tensor]]:
"""Forward pass.
Args:
......@@ -124,30 +131,35 @@ class PointRPNHead(BaseModule):
batch_size, -1, self._get_reg_out_channels())
return point_box_preds, point_cls_preds
def loss(self,
bbox_preds,
cls_preds,
points,
gt_bboxes_3d,
gt_labels_3d,
img_metas=None):
def loss_by_feat(
self,
bbox_preds: List[Tensor],
cls_preds: List[Tensor],
points: List[Tensor],
batch_gt_instances_3d: InstanceList,
batch_input_metas: Optional[List[dict]] = None,
batch_gt_instances_ignore: Optional[InstanceList] = None) -> Dict:
"""Compute loss.
Args:
bbox_preds (dict): Predictions from forward of PointRCNN RPN_Head.
cls_preds (dict): Classification from forward of PointRCNN
RPN_Head.
bbox_preds (list[torch.Tensor]): Predictions from forward of
PointRCNN RPN_Head.
cls_preds (list[torch.Tensor]): Classification from forward of
PointRCNN RPN_Head.
points (list[torch.Tensor]): Input points.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
bboxes of each sample.
gt_labels_3d (list[torch.Tensor]): Labels of each sample.
img_metas (list[dict], Optional): Contain pcd and img's meta info.
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instances_3d. It usually includes ``bboxes_3d`` and
``labels_3d`` attributes.
batch_input_metas (list[dict]): Contain pcd and img's meta info.
batch_gt_instances_ignore (list[:obj:`InstanceData`], optional):
Batch of gt_instances_ignore. It includes ``bboxes`` attribute
data that is ignored during training and testing.
Defaults to None.
Returns:
dict: Losses of PointRCNN RPN module.
"""
targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d)
targets = self.get_targets(points, batch_gt_instances_3d)
(bbox_targets, mask_targets, positive_mask, negative_mask,
box_loss_weights, point_targets) = targets
......@@ -169,25 +181,25 @@ class PointRPNHead(BaseModule):
return losses
def get_targets(self, points, gt_bboxes_3d, gt_labels_3d):
def get_targets(self, points: List[Tensor],
batch_gt_instances_3d: InstanceList) -> Tuple[Tensor]:
"""Generate targets of PointRCNN RPN head.
Args:
points (list[torch.Tensor]): Points of each batch.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): Labels of each batch.
points (list[torch.Tensor]): Points in one batch.
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instances_3d. It usually includes ``bboxes_3d`` and
``labels_3d`` attributes.
Returns:
tuple[torch.Tensor]: Targets of PointRCNN RPN head.
"""
# find empty example
for index in range(len(gt_labels_3d)):
if len(gt_labels_3d[index]) == 0:
fake_box = gt_bboxes_3d[index].tensor.new_zeros(
1, gt_bboxes_3d[index].tensor.shape[-1])
gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box)
gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1)
gt_labels_3d = [
instances.labels_3d for instances in batch_gt_instances_3d
]
gt_bboxes_3d = [
instances.bboxes_3d for instances in batch_gt_instances_3d
]
(bbox_targets, mask_targets, positive_mask, negative_mask,
point_targets) = multi_apply(self.get_targets_single, points,
......@@ -202,7 +214,9 @@ class PointRPNHead(BaseModule):
return (bbox_targets, mask_targets, positive_mask, negative_mask,
box_loss_weights, point_targets)
def get_targets_single(self, points, gt_bboxes_3d, gt_labels_3d):
def get_targets_single(self, points: Tensor,
gt_bboxes_3d: BaseInstance3DBoxes,
gt_labels_3d: Tensor) -> Tuple[Tensor]:
"""Generate targets of PointRCNN RPN head for single batch.
Args:
......@@ -243,24 +257,34 @@ class PointRPNHead(BaseModule):
return (bbox_targets, mask_targets, positive_mask, negative_mask,
point_targets)
def get_bboxes(self,
points,
bbox_preds,
cls_preds,
input_metas,
rescale=False):
def predict_by_feat(self, points: Tensor, bbox_preds: List[Tensor],
cls_preds: List[Tensor], batch_input_metas: List[dict],
cfg: Optional[dict]) -> InstanceList:
"""Generate bboxes from RPN head predictions.
Args:
points (torch.Tensor): Input points.
bbox_preds (dict): Regression predictions from PointRCNN head.
cls_preds (dict): Class scores predictions from PointRCNN head.
input_metas (list[dict]): Point cloud and image's meta info.
rescale (bool, optional): Whether to rescale bboxes.
Defaults to False.
bbox_preds (list[tensor]): Regression predictions from PointRCNN
head.
cls_preds (list[tensor]): Class scores predictions from PointRCNN
head.
batch_input_metas (list[dict]): Batch inputs meta info.
cfg (ConfigDict, optional): Test / postprocessing
configuration.
Returns:
list[tuple[torch.Tensor]]: Bounding boxes, scores and labels.
list[:obj:`InstanceData`]: Detection results of each sample
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
contains a tensor with shape (num_instances, C), where
C >= 7.
- cls_preds (torch.Tensor): Class score of each bbox.
"""
sem_scores = cls_preds.sigmoid()
obj_scores = sem_scores.max(-1)[0]
......@@ -271,30 +295,40 @@ class PointRPNHead(BaseModule):
for b in range(batch_size):
bbox3d = self.bbox_coder.decode(bbox_preds[b], points[b, ..., :3],
object_class[b])
mask = ~bbox3d.sum(dim=1).isinf()
bbox_selected, score_selected, labels, cls_preds_selected = \
self.class_agnostic_nms(obj_scores[b], sem_scores[b], bbox3d,
points[b, ..., :3], input_metas[b])
bbox = input_metas[b]['box_type_3d'](
bbox_selected.clone(),
box_dim=bbox_selected.shape[-1],
with_yaw=True)
results.append((bbox, score_selected, labels, cls_preds_selected))
self.class_agnostic_nms(obj_scores[b][mask],
sem_scores[b][mask, :],
bbox3d[mask, :],
points[b, ..., :3][mask, :],
batch_input_metas[b],
cfg.nms_cfg)
bbox_selected = batch_input_metas[b]['box_type_3d'](
bbox_selected, box_dim=bbox_selected.shape[-1])
result = InstanceData()
result.bboxes_3d = bbox_selected
result.scores_3d = score_selected
result.labels_3d = labels
result.cls_preds = cls_preds_selected
results.append(result)
return results
def class_agnostic_nms(self, obj_scores, sem_scores, bbox, points,
input_meta):
def class_agnostic_nms(self, obj_scores: Tensor, sem_scores: Tensor,
bbox: Tensor, points: Tensor, input_meta: Dict,
nms_cfg: Dict) -> Tuple[Tensor]:
"""Class agnostic nms.
Args:
obj_scores (torch.Tensor): Objectness score of bounding boxes.
sem_scores (torch.Tensor): Semantic class score of bounding boxes.
bbox (torch.Tensor): Predicted bounding boxes.
points (torch.Tensor): Input points.
input_meta (dict): Contain pcd and img's meta info.
nms_cfg (dict): NMS config dict.
Returns:
tuple[torch.Tensor]: Bounding boxes, scores and labels.
"""
nms_cfg = self.test_cfg.nms_cfg if not self.training \
else self.train_cfg.nms_cfg
if nms_cfg.use_rotate_nms:
nms_func = nms_bev
else:
......@@ -323,14 +357,14 @@ class PointRPNHead(BaseModule):
bbox = bbox[nonempty_box_mask]
if self.test_cfg.score_thr is not None:
score_thr = self.test_cfg.score_thr
if nms_cfg.score_thr is not None:
score_thr = nms_cfg.score_thr
keep = (obj_scores >= score_thr)
obj_scores = obj_scores[keep]
sem_scores = sem_scores[keep]
bbox = bbox.tensor[keep]
if obj_scores.shape[0] > 0:
if bbox.tensor.shape[0] > 0:
topk = min(nms_cfg.nms_pre, obj_scores.shape[0])
obj_scores_nms, indices = torch.topk(obj_scores, k=topk)
bbox_for_nms = xywhr2xyxyr(bbox[indices].bev)
......@@ -343,15 +377,22 @@ class PointRPNHead(BaseModule):
score_selected = obj_scores_nms[keep]
cls_preds = sem_scores_nms[keep]
labels = torch.argmax(cls_preds, -1)
if bbox_selected.shape[0] > nms_cfg.nms_post:
_, inds = score_selected.sort(descending=True)
inds = inds[:nms_cfg.nms_post]
bbox_selected = bbox_selected[inds, :]
labels = labels[inds]
score_selected = score_selected[inds]
cls_preds = cls_preds[inds, :]
else:
bbox_selected = bbox.tensor
score_selected = obj_scores.new_zeros([0])
labels = obj_scores.new_zeros([0])
cls_preds = obj_scores.new_zeros([0, sem_scores.shape[-1]])
return bbox_selected, score_selected, labels, cls_preds
def _assign_targets_by_points_inside(self, bboxes_3d, points):
def _assign_targets_by_points_inside(self, bboxes_3d: BaseInstance3DBoxes,
points: Tensor) -> Tuple[Tensor]:
"""Compute assignment by checking whether point is inside bbox.
Args:
......@@ -379,3 +420,92 @@ class PointRPNHead(BaseModule):
raise NotImplementedError('Unsupported bbox type!')
return points_mask, assignment
def predict(self, feats_dict: Dict,
batch_data_samples: SampleList) -> InstanceList:
"""Perform forward propagation of the 3D detection head and predict
detection results on the features of the upstream network.
Args:
feats_dict (dict): Contains features from the first stage.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
samples. It usually includes information such as
`gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
Returns:
list[:obj:`InstanceData`]: Detection results of each sample
after the post process.
Each item usually contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instances, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (BaseInstance3DBoxes): Prediction of bboxes,
contains a tensor with shape (num_instances, C), where
C >= 7.
"""
batch_input_metas = [
data_samples.metainfo for data_samples in batch_data_samples
]
raw_points = feats_dict.pop('raw_points')
bbox_preds, cls_preds = self(feats_dict)
proposal_cfg = self.test_cfg
proposal_list = self.predict_by_feat(
raw_points,
bbox_preds,
cls_preds,
cfg=proposal_cfg,
batch_input_metas=batch_input_metas)
feats_dict['points_cls_preds'] = cls_preds
return proposal_list
def loss_and_predict(self,
feats_dict: Dict,
batch_data_samples: SampleList,
proposal_cfg: Optional[dict] = None,
**kwargs) -> Tuple[dict, InstanceList]:
"""Perform forward propagation of the head, then calculate loss and
predictions from the features and data samples.
Args:
feats_dict (dict): Contains features from the first stage.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
samples. It usually includes information such as
`gt_instance_3d`, `gt_panoptic_seg_3d` and `gt_sem_seg_3d`.
proposal_cfg (ConfigDict, optional): Proposal config.
Returns:
tuple: the return value is a tuple contains:
- losses: (dict[str, Tensor]): A dictionary of loss components.
- predictions (list[:obj:`InstanceData`]): Detection
results of each sample after the post process.
"""
batch_gt_instances_3d = []
batch_gt_instances_ignore = []
batch_input_metas = []
for data_sample in batch_data_samples:
batch_input_metas.append(data_sample.metainfo)
batch_gt_instances_3d.append(data_sample.gt_instances_3d)
batch_gt_instances_ignore.append(
data_sample.get('ignored_instances', None))
raw_points = feats_dict.pop('raw_points')
bbox_preds, cls_preds = self(feats_dict)
loss_inputs = (bbox_preds, cls_preds,
raw_points) + (batch_gt_instances_3d, batch_input_metas,
batch_gt_instances_ignore)
losses = self.loss_by_feat(*loss_inputs)
predictions = self.predict_by_feat(
raw_points,
bbox_preds,
cls_preds,
batch_input_metas=batch_input_metas,
cfg=proposal_cfg)
feats_dict['points_cls_preds'] = cls_preds
if predictions[0].bboxes_3d.tensor.isinf().any():
print(predictions)
return losses, predictions
......@@ -14,7 +14,6 @@ from mmdet3d.structures.bbox_3d import (DepthInstance3DBoxes,
LiDARInstance3DBoxes,
rotation_3d_in_axis)
from mmdet.models.utils import multi_apply
from ..builder import build_loss
from .vote_head import VoteHead
......@@ -76,8 +75,8 @@ class SSD3DHead(VoteHead):
size_res_loss=size_res_loss,
semantic_loss=None,
init_cfg=init_cfg)
self.corner_loss = build_loss(corner_loss)
self.vote_loss = build_loss(vote_loss)
self.corner_loss = MODELS.build(corner_loss)
self.vote_loss = MODELS.build(vote_loss)
self.num_candidates = vote_module_cfg['num_points']
def _get_cls_out_channels(self) -> int:
......
# Copyright (c) OpenMMLab. All rights reserved.
from .base import Base3DDetector
from .centerpoint import CenterPoint
from .dfm import DfM
from .dynamic_voxelnet import DynamicVoxelNet
from .fcos_mono3d import FCOSMono3D
from .groupfree3dnet import GroupFree3DNet
from .h3dnet import H3DNet
from .imvotenet import ImVoteNet
from .imvoxelnet import ImVoxelNet
from .multiview_dfm import MultiViewDfM
from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN
from .mvx_two_stage import MVXTwoStageDetector
from .parta2 import PartA2
......@@ -19,9 +21,25 @@ from .votenet import VoteNet
from .voxelnet import VoxelNet
__all__ = [
'Base3DDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXTwoStageDetector',
'DynamicMVXFasterRCNN', 'MVXFasterRCNN', 'PartA2', 'VoteNet', 'H3DNet',
'CenterPoint', 'SSD3DNet', 'ImVoteNet', 'SingleStageMono3DDetector',
'FCOSMono3D', 'ImVoxelNet', 'GroupFree3DNet', 'PointRCNN', 'SMOKEMono3D',
'SASSD'
'Base3DDetector',
'DfM',
'VoxelNet',
'DynamicVoxelNet',
'MVXTwoStageDetector',
'DynamicMVXFasterRCNN',
'MVXFasterRCNN',
'MultiViewDfM',
'PartA2',
'VoteNet',
'H3DNet',
'CenterPoint',
'SSD3DNet',
'ImVoteNet',
'SingleStageMono3DDetector',
'FCOSMono3D',
'ImVoxelNet',
'GroupFree3DNet',
'PointRCNN',
'SMOKEMono3D',
'SASSD',
]
......@@ -89,7 +89,7 @@ class Base3DDetector(BaseDetector):
raise RuntimeError(f'Invalid mode "{mode}". '
'Only supports loss, predict and tensor mode')
def convert_to_datasample(
def add_pred_to_datasample(
self,
data_samples: SampleList,
data_instances_3d: OptInstanceList = None,
......
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from mmdet3d.registry import MODELS
from mmdet3d.structures.ops import bbox3d2result
from mmdet3d.utils import ConfigType
from mmdet.models.detectors import BaseDetector
from ..builder import build_backbone, build_head, build_neck
@MODELS.register_module()
class DfM(BaseDetector):
r"""`Monocular 3D Object Detection with Depth from Motion.
<https://arxiv.org/abs/2207.12988>`_.
Args:
backbone (:obj:`ConfigDict` or dict): The backbone config.
neck (:obj:`ConfigDict` or dict): The neck config.
backbone_stereo (:obj:`ConfigDict` or dict): The stereo backbone
config.
backbone_3d (:obj:`ConfigDict` or dict): The 3d backbone config.
neck_3d (:obj:`ConfigDict` or dict): The 3D neck config.
bbox_head_3d (:obj:`ConfigDict` or dict): The 3d bbox head config.
neck_2d (:obj:`ConfigDict` or dict, optional): The 2D neck config
for 2D object detection. Defaults to None.
bbox_head_2d (:obj:`ConfigDict` or dict, optional): The 2D bbox
head config for 2D object detection. Defaults to None.
depth_head_2d (:obj:`ConfigDict` or dict, optional): The 2D depth
head config for depth estimation in fov space. Defaults to None.
depth_head (:obj:`ConfigDict` or dict, optional): The depth head
config for depth estimation in 3D voxel space projected to FoV
space. Defaults to None.
train_cfg (:obj:`ConfigDict` or dict, optional): Config dict of
training hyper-parameters. Defaults to None.
test_cfg (:obj:`ConfigDict` or dict, optional): Config dict of test
hyper-parameters. Defaults to None.
pretrained (:obj:`ConfigDict` or dict, optional): The pretrained
config.
init_cfg (:obj:`ConfigDict` or dict, optional): The initialization
config. Defaults to None.
"""
def __init__(self,
backbone: ConfigType,
neck: ConfigType,
backbone_stereo: ConfigType,
backbone_3d: ConfigType,
neck_3d: ConfigType,
bbox_head_3d: ConfigType,
neck_2d=None,
bbox_head_2d=None,
depth_head_2d=None,
depth_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.backbone = build_backbone(backbone)
self.neck = build_neck(neck)
if backbone_stereo is not None:
backbone_stereo.update(cat_img_feature=self.neck.cat_img_feature)
backbone_stereo.update(in_sem_channels=self.neck.sem_channels[-1])
self.backbone_stereo = build_backbone(backbone_stereo)
assert self.neck.cat_img_feature == \
self.backbone_stereo.cat_img_feature
assert self.neck.sem_channels[
-1] == self.backbone_stereo.in_sem_channels
if backbone_3d is not None:
self.backbone_3d = build_backbone(backbone_3d)
if neck_3d is not None:
self.neck_3d = build_neck(neck_3d)
if neck_2d is not None:
self.neck_2d = build_neck(neck_2d)
if bbox_head_2d is not None:
self.bbox_head_2d = build_head(bbox_head_2d)
if depth_head_2d is not None:
self.depth_head_2d = build_head(depth_head_2d)
if depth_head is not None:
self.depth_head = build_head(depth_head)
self.depth_samples = self.depth_head.depth_samples
self.train_cfg = train_cfg
self.test_cfg = test_cfg
bbox_head_3d.update(train_cfg=train_cfg)
bbox_head_3d.update(test_cfg=test_cfg)
self.bbox_head_3d = build_head(bbox_head_3d)
@property
def with_backbone_3d(self):
"""Whether the detector has a 3D backbone."""
return hasattr(self, 'backbone_3d') and self.backbone_3d is not None
@property
def with_neck_3d(self):
"""Whether the detector has a 3D neck."""
return hasattr(self, 'neck_3d') and self.neck_3d is not None
@property
def with_neck_2d(self):
"""Whether the detector has a 2D neck."""
return hasattr(self, 'neck_2d') and self.neck_2d is not None
@property
def with_bbox_head_2d(self):
"""Whether the detector has a 2D detection head."""
return hasattr(self, 'bbox_head_2d') and self.bbox_head_2d is not None
@property
def with_depth_head_2d(self):
"""Whether the detector has a image-based depth head."""
return hasattr(self,
'depth_head_2d') and self.depth_head_2d is not None
@property
def with_depth_head(self):
"""Whether the detector has a frustum-based depth head."""
return hasattr(self, 'depth_head') and self.depth_head is not None
def extract_feat(self, img, img_metas):
"""Feature extraction for perspective-view images.
Args:
img (torch.Tensor): Images of shape [B, N, C_in, H, W].
img_metas (list): Image meta information. Each element corresponds
to a group of images. len(img_metas) == B.
Returns:
torch.Tensor: bev feature with shape [B, C_out, N_y, N_x].
"""
# split input img into current and previous ones
batch_size, N, C_in, H, W = img.shape
cur_imgs = img[:, 0]
prev_imgs = img[:, 1] # TODO: to support multiple prev imgs
# 2D backbone for feature extraction
cur_feats = self.backbone(cur_imgs)
cur_feats = [cur_imgs] + list(cur_feats)
prev_feats = self.backbone(prev_imgs)
prev_feats = [prev_imgs] + list(prev_feats)
# SPP module as the feature neck
cur_stereo_feat, cur_sem_feat = self.neck(cur_feats)
prev_stereo_feat, prev_sem_feat = self.neck(prev_feats)
# derive cur2prevs
cur_pose = torch.tensor(
[img_meta['cam2global'] for img_meta in img_metas],
device=img.device)[:, None, :, :] # (B, 1, 4, 4)
prev_poses = []
for img_meta in img_metas:
sweep_img_metas = img_meta['sweep_img_metas']
prev_poses.append([
sweep_img_meta['cam2global']
for sweep_img_meta in sweep_img_metas
])
prev_poses = torch.tensor(prev_poses, device=img.device)
pad_prev_cam2global = torch.eye(4)[None, None].expand(
batch_size, N - 1, 4, 4).to(img.device)
pad_prev_cam2global[:, :, :prev_poses.shape[-2], :prev_poses.
shape[-1]] = prev_poses
pad_cur_cam2global = torch.eye(4)[None,
None].expand(batch_size, 1, 4,
4).to(img.device)
pad_cur_cam2global[:, :, :cur_pose.shape[-2], :cur_pose.
shape[-1]] = cur_pose
# (B, N-1, 4, 4) * (B, 1, 4, 4) -> (B, N-1, 4, 4)
# torch.linalg.solve is faster and more numerically stable
# than torch.matmul(torch.linalg.inv(A), B)
# empirical results show that torch.linalg.solve can derive
# almost the same result with np.linalg.inv
# while torch.linalg.inv can not
cur2prevs = torch.linalg.solve(pad_prev_cam2global, pad_cur_cam2global)
for meta_idx, img_meta in enumerate(img_metas):
img_meta['cur2prevs'] = cur2prevs[meta_idx]
# stereo backbone for depth estimation
# volume_feat: (batch_size, Cv, Nz, Ny, Nx)
volume_feat = self.backbone_stereo(cur_stereo_feat, prev_stereo_feat,
img_metas, cur_sem_feat)
# height compression
_, Cv, Nz, Ny, Nx = volume_feat.shape
bev_feat = volume_feat.view(batch_size, Cv * Nz, Ny, Nx)
bev_feat_prehg, bev_feat = self.neck_3d(bev_feat)
return bev_feat
def forward_train(self,
img,
img_metas,
gt_bboxes_3d,
gt_labels_3d,
depth_img=None,
**kwargs):
"""Forward function for training."""
bev_feat = self.extract_feat(img, img_metas)
outs = self.bbox_head_3d([bev_feat])
losses = self.bbox_head_3d.loss(*outs, gt_bboxes_3d, gt_labels_3d,
img_metas)
# TODO: loss_dense_depth, loss_2d, loss_imitation
return losses
def forward_test(self, img, img_metas, **kwargs):
"""Forward of testing.
Args:
img (torch.Tensor): Input images of shape (N, C_in, H, W).
img_metas (list): Image metas.
Returns:
list[dict]: Predicted 3d boxes.
"""
# not supporting aug_test for now
return self.simple_test(img, img_metas)
def simple_test(self, img, img_metas):
"""Simple inference forward without test time augmentation."""
bev_feat = self.extract_feat(img, img_metas)
# bbox_head takes a list of feature from different levels as input
# so need [bev_feat]
outs = self.bbox_head_3d([bev_feat])
bbox_list = self.bbox_head_3d.get_bboxes(*outs, img_metas)
bbox_results = [
bbox3d2result(det_bboxes, det_scores, det_labels)
for det_bboxes, det_scores, det_labels in bbox_list
]
# add pseudo-lidar label to each pred_dict for post-processing
for bbox_result in bbox_results:
bbox_result['pseudo_lidar'] = True
return bbox_results
def aug_test(self, imgs, img_metas, **kwargs):
"""Test with augmentations.
Args:
imgs (list[torch.Tensor]): Input images of shape (N, C_in, H, W).
img_metas (list): Image metas.
Returns:
list[dict]: Predicted 3d boxes.
"""
raise NotImplementedError
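To ground the cur2prevs comment in extract_feat above, a tiny check with random poses (purely illustrative) that torch.linalg.solve(A, B) matches inv(A) @ B:
import torch

A = torch.eye(4) + 0.01 * torch.rand(4, 4)  # stand-in for a prev cam2global pose
B = torch.eye(4) + 0.01 * torch.rand(4, 4)  # stand-in for the cur cam2global pose
print(torch.allclose(torch.linalg.solve(A, B), torch.linalg.inv(A) @ B, atol=1e-5))  # True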
......@@ -10,7 +10,8 @@ from .voxelnet import VoxelNet
@MODELS.register_module()
class DynamicVoxelNet(VoxelNet):
r"""VoxelNet using `dynamic voxelization <https://arxiv.org/abs/1910.06528>`_.
r"""VoxelNet using `dynamic voxelization
<https://arxiv.org/abs/1910.06528>`_.
"""
def __init__(self,
......
......@@ -95,6 +95,7 @@ class FCOSMono3D(SingleStageMono3DDetector):
x = self.extract_feat(batch_inputs_dict)
results_list, results_list_2d = self.bbox_head.predict(
x, batch_data_samples, rescale=rescale)
predictions = self.convert_to_datasample(batch_data_samples,
results_list, results_list_2d)
predictions = self.add_pred_to_datasample(batch_data_samples,
results_list,
results_list_2d)
return predictions
......@@ -82,6 +82,6 @@ class GroupFree3DNet(SingleStage3DDetector):
points = batch_inputs_dict['points']
results_list = self.bbox_head.predict(points, x, batch_data_samples,
**kwargs)
predictions = self.convert_to_datasample(batch_data_samples,
results_list)
predictions = self.add_pred_to_datasample(batch_data_samples,
results_list)
return predictions
......@@ -154,4 +154,4 @@ class H3DNet(TwoStage3DDetector):
feats_dict,
batch_data_samples,
suffix='_optimized')
return self.convert_to_datasample(batch_data_samples, results_list)
return self.add_pred_to_datasample(batch_data_samples, results_list)
......@@ -433,7 +433,7 @@ class ImVoteNet(Base3DDetector):
if points is None:
assert imgs is not None
results_2d = self.predict_img_only(imgs, batch_data_samples)
return self.convert_to_datasample(
return self.add_pred_to_datasample(
batch_data_samples, data_instances_2d=results_2d)
else:
......@@ -488,7 +488,7 @@ class ImVoteNet(Base3DDetector):
batch_data_samples,
rescale=True)
return self.convert_to_datasample(batch_data_samples, results_3d)
return self.add_pred_to_datasample(batch_data_samples, results_3d)
def predict_img_only(self,
imgs: Tensor,
......
......@@ -2,16 +2,17 @@
from typing import List, Tuple, Union
import torch
from mmengine.structures import InstanceData
from mmdet3d.models.detectors import Base3DDetector
from mmdet3d.models.layers.fusion_layers.point_fusion import point_sample
from mmdet3d.registry import MODELS, TASK_UTILS
from mmdet3d.structures.det3d_data_sample import SampleList
from mmdet3d.utils import ConfigType, InstanceList, OptConfigType
from mmdet.models.detectors import BaseDetector
from mmdet3d.utils import ConfigType, OptConfigType, OptInstanceList
@MODELS.register_module()
class ImVoxelNet(BaseDetector):
class ImVoxelNet(Base3DDetector):
r"""`ImVoxelNet <https://arxiv.org/abs/2106.01178>`_.
Args:
......@@ -57,31 +58,6 @@ class ImVoxelNet(BaseDetector):
self.train_cfg = train_cfg
self.test_cfg = test_cfg
def convert_to_datasample(self, data_samples: SampleList,
data_instances: InstanceList) -> SampleList:
""" Convert results list to `Det3DDataSample`.
Args:
inputs (list[:obj:`Det3DDataSample`]): The input data.
data_instances (list[:obj:`InstanceData`]): 3D Detection
results of each image.
Returns:
list[:obj:`Det3DDataSample`]: 3D Detection results of the
input images. Each Det3DDataSample usually contain
'pred_instances_3d'. And the ``pred_instances_3d`` usually
contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instance, )
- labels_3d (Tensor): Labels of bboxes, has a shape
(num_instances, ).
- bboxes_3d (Tensor): Contains a tensor with shape
(num_instances, C) where C >=7.
"""
for data_sample, pred_instances_3d in zip(data_samples,
data_instances):
data_sample.pred_instances_3d = pred_instances_3d
return data_samples
def extract_feat(self, batch_inputs_dict: dict,
batch_data_samples: SampleList):
"""Extract 3d features from the backbone -> fpn -> 3d projection.
......@@ -185,8 +161,8 @@ class ImVoxelNet(BaseDetector):
"""
x = self.extract_feat(batch_inputs_dict, batch_data_samples)
results_list = self.bbox_head.predict(x, batch_data_samples, **kwargs)
predictions = self.convert_to_datasample(batch_data_samples,
results_list)
predictions = self.add_pred_to_datasample(batch_data_samples,
results_list)
return predictions
def _forward(self, batch_inputs_dict: dict, batch_data_samples: SampleList,
......@@ -209,3 +185,64 @@ class ImVoxelNet(BaseDetector):
x = self.extract_feat(batch_inputs_dict, batch_data_samples)
results = self.bbox_head.forward(x)
return results
def convert_to_datasample(
self,
data_samples: SampleList,
data_instances_3d: OptInstanceList = None,
data_instances_2d: OptInstanceList = None,
) -> SampleList:
"""Convert results list to `Det3DDataSample`.
Subclasses could override it to be compatible for some multi-modality
3D detectors.
Args:
data_samples (list[:obj:`Det3DDataSample`]): The input data.
data_instances_3d (list[:obj:`InstanceData`], optional): 3D
Detection results of each sample.
data_instances_2d (list[:obj:`InstanceData`], optional): 2D
Detection results of each sample.
Returns:
list[:obj:`Det3DDataSample`]: Detection results of the
input. Each Det3DDataSample usually contains
'pred_instances_3d'. And the ``pred_instances_3d`` normally
contains following keys.
- scores_3d (Tensor): Classification scores, has a shape
(num_instance, )
- labels_3d (Tensor): Labels of 3D bboxes, has a shape
(num_instances, ).
- bboxes_3d (Tensor): Contains a tensor with shape
(num_instances, C) where C >=7.
When there are image predictions in some models, it should
contain `pred_instances`, and the ``pred_instances`` normally
contains following keys.
- scores (Tensor): Classification scores of image, has a shape
(num_instance, )
- labels (Tensor): Predict Labels of 2D bboxes, has a shape
(num_instances, ).
- bboxes (Tensor): Contains a tensor with shape
(num_instances, 4).
"""
assert (data_instances_2d is not None) or \
(data_instances_3d is not None),\
'please pass at least one type of data_samples'
if data_instances_2d is None:
data_instances_2d = [
InstanceData() for _ in range(len(data_instances_3d))
]
if data_instances_3d is None:
data_instances_3d = [
InstanceData() for _ in range(len(data_instances_2d))
]
for i, data_sample in enumerate(data_samples):
data_sample.pred_instances_3d = data_instances_3d[i]
data_sample.pred_instances = data_instances_2d[i]
return data_samples
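A minimal sketch of the pairing behavior above, assuming bare Det3DDataSample and InstanceData objects; when one modality is missing, it is filled with an empty InstanceData.
import torch
from mmengine.structures import InstanceData
from mmdet3d.structures.det3d_data_sample import Det3DDataSample

samples = [Det3DDataSample() for _ in range(2)]
instances_3d = [InstanceData(scores_3d=torch.rand(3)) for _ in range(2)]
for data_sample, pred_3d in zip(samples, instances_3d):
    data_sample.pred_instances_3d = pred_3d
    data_sample.pred_instances = InstanceData()  # empty 2D placeholder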