Commit 4cd43886 authored by lishj6

init

parent a9a1fe81
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np
from mmcv.parallel import DataContainer as DC
from mmdet3d.core.bbox import BaseInstance3DBoxes
from mmdet3d.core.points import BasePoints
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import to_tensor
from mmdet3d.datasets.pipelines import DefaultFormatBundle3D
@PIPELINES.register_module()
class CustomDefaultFormatBundle3D(DefaultFormatBundle3D):
"""Default formatting bundle.
It simplifies the pipeline of formatting common fields for voxels,
including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and
"gt_semantic_seg".
These fields are formatted as follows.
- img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
- proposals: (1)to tensor, (2)to DataContainer
- gt_bboxes: (1)to tensor, (2)to DataContainer
- gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
- gt_labels: (1)to tensor, (2)to DataContainer
"""
def __call__(self, results):
"""Call function to transform and format common fields in results.
Args:
results (dict): Result dict contains the data to convert.
Returns:
dict: The result dict contains the data that is formatted with
default bundle.
"""
# Format 3D data
results = super(CustomDefaultFormatBundle3D, self).__call__(results)
results['gt_map_masks'] = DC(
to_tensor(results['gt_map_masks']), stack=True)
return results
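# Illustrative usage sketch (the pipeline entry below is an assumption, not a
# config shipped in this commit): the bundle is typically placed near the end
# of the training pipeline, after the transforms that produce 'gt_map_masks'.
# train_pipeline = [
#     ...,
#     dict(type='CustomDefaultFormatBundle3D', class_names=class_names),
# ]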
import numpy as np
from numpy import random
import mmcv
from mmdet.datasets.builder import PIPELINES
from mmcv.parallel import DataContainer as DC
@PIPELINES.register_module()
class PadMultiViewImage(object):
"""Pad the multi-view image.
There are two padding modes: (1) pad to a fixed size and (2) pad to the
minimum size that is divisible by some number.
Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor",
Args:
size (tuple, optional): Fixed padding size.
size_divisor (int, optional): The divisor of padded size.
pad_val (float, optional): Padding value, 0 by default.
"""
def __init__(self, size=None, size_divisor=None, pad_val=0):
self.size = size
self.size_divisor = size_divisor
self.pad_val = pad_val
# only one of size and size_divisor should be valid
assert size is not None or size_divisor is not None
assert size is None or size_divisor is None
def _pad_img(self, results):
"""Pad images according to ``self.size``."""
if self.size is not None:
padded_img = [mmcv.impad(
img, shape=self.size, pad_val=self.pad_val) for img in results['img']]
elif self.size_divisor is not None:
padded_img = [mmcv.impad_to_multiple(
img, self.size_divisor, pad_val=self.pad_val) for img in results['img']]
results['ori_shape'] = [img.shape for img in results['img']]
results['img'] = padded_img
results['img_shape'] = [img.shape for img in padded_img]
results['pad_shape'] = [img.shape for img in padded_img]
results['pad_fixed_size'] = self.size
results['pad_size_divisor'] = self.size_divisor
def __call__(self, results):
"""Call function to pad images, masks, semantic segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Updated result dict.
"""
self._pad_img(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(size={self.size}, '
repr_str += f'size_divisor={self.size_divisor}, '
repr_str += f'pad_val={self.pad_val})'
return repr_str
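# Illustrative config sketch (the value is an assumption): pad each camera
# image so that its height and width are divisible by 32, as FPN-style
# backbones usually expect.
# dict(type='PadMultiViewImage', size_divisor=32)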
@PIPELINES.register_module()
class NormalizeMultiviewImage(object):
"""Normalize the image.
Added key is "img_norm_cfg".
Args:
mean (sequence): Mean values of 3 channels.
std (sequence): Std values of 3 channels.
to_rgb (bool): Whether to convert the image from BGR to RGB,
default is true.
"""
def __init__(self, mean, std, to_rgb=True):
self.mean = np.array(mean, dtype=np.float32)
self.std = np.array(std, dtype=np.float32)
self.to_rgb = to_rgb
def __call__(self, results):
"""Call function to normalize images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Normalized results, 'img_norm_cfg' key is added into
result dict.
"""
results['img'] = [mmcv.imnormalize(img, self.mean, self.std, self.to_rgb) for img in results['img']]
results['img_norm_cfg'] = dict(
mean=self.mean, std=self.std, to_rgb=self.to_rgb)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
return repr_str
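# Illustrative config sketch (the mean/std values are the commonly used
# Caffe-style ImageNet statistics, an assumption rather than a value taken
# from this commit):
# img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
# dict(type='NormalizeMultiviewImage', **img_norm_cfg)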
@PIPELINES.register_module()
class PhotoMetricDistortionMultiViewImage:
"""Apply photometric distortion to image sequentially, every transformation
is applied with a probability of 0.5. The position of random contrast is in
second or second to last.
1. random brightness
2. random contrast (mode 0)
3. convert color from BGR to HSV
4. random saturation
5. random hue
6. convert color from HSV to BGR
7. random contrast (mode 1)
8. randomly swap channels
Args:
brightness_delta (int): delta of brightness.
contrast_range (tuple): range of contrast.
saturation_range (tuple): range of saturation.
hue_delta (int): delta of hue.
"""
def __init__(self,
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18):
self.brightness_delta = brightness_delta
self.contrast_lower, self.contrast_upper = contrast_range
self.saturation_lower, self.saturation_upper = saturation_range
self.hue_delta = hue_delta
def __call__(self, results):
"""Call function to perform photometric distortion on images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with images distorted.
"""
imgs = results['img']
new_imgs = []
for img in imgs:
assert img.dtype == np.float32, \
'PhotoMetricDistortion needs the input image of dtype np.float32,'\
' please set "to_float32=True" in "LoadImageFromFile" pipeline'
# random brightness
if random.randint(2):
delta = random.uniform(-self.brightness_delta,
self.brightness_delta)
img += delta
# mode == 0 --> do random contrast first
# mode == 1 --> do random contrast last
mode = random.randint(2)
if mode == 1:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# convert color from BGR to HSV
img = mmcv.bgr2hsv(img)
# random saturation
if random.randint(2):
img[..., 1] *= random.uniform(self.saturation_lower,
self.saturation_upper)
# random hue
if random.randint(2):
img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
img[..., 0][img[..., 0] > 360] -= 360
img[..., 0][img[..., 0] < 0] += 360
# convert color from HSV to BGR
img = mmcv.hsv2bgr(img)
# random contrast
if mode == 0:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# randomly swap channels
if random.randint(2):
img = img[..., random.permutation(3)]
new_imgs.append(img)
results['img'] = new_imgs
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
repr_str += 'contrast_range='
repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
repr_str += 'saturation_range='
repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
repr_str += f'hue_delta={self.hue_delta})'
return repr_str
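# Illustrative ordering sketch (an assumption, not a config from this commit):
# the distortion expects float32 images, so the loading transform should set
# to_float32=True and the distortion should run before normalization.
# train_pipeline = [
#     dict(type='LoadMultiViewImageFromFiles', to_float32=True),
#     dict(type='PhotoMetricDistortionMultiViewImage'),
#     dict(type='NormalizeMultiviewImage', **img_norm_cfg),
#     ...,
# ]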
@PIPELINES.register_module()
class CustomCollect3D(object):
"""Collect data from the loader relevant to the specific task.
    This is usually the last stage of the data loader pipeline. Typically
    ``keys`` is set to some subset of "img", "proposals", "gt_bboxes",
    "gt_bboxes_ignore", "gt_labels", and/or "gt_masks".
    The "img_meta" item is always populated. The contents of the "img_meta"
    dictionary depend on "meta_keys". By default this includes:
- 'img_shape': shape of the image input to the network as a tuple \
(h, w, c). Note that images may be zero padded on the \
bottom/right if the batch tensor is larger than this shape.
- 'scale_factor': a float indicating the preprocessing scale
- 'flip': a boolean indicating if image flip transform was used
- 'filename': path to the image file
- 'ori_shape': original shape of the image as a tuple (h, w, c)
- 'pad_shape': image shape after padding
- 'lidar2img': transform from lidar to image
- 'depth2img': transform from depth to image
- 'cam2img': transform from camera to image
- 'pcd_horizontal_flip': a boolean indicating if point cloud is \
flipped horizontally
- 'pcd_vertical_flip': a boolean indicating if point cloud is \
flipped vertically
- 'box_mode_3d': 3D box mode
- 'box_type_3d': 3D box type
- 'img_norm_cfg': a dict of normalization information:
- mean: per channel mean subtraction
- std: per channel std divisor
- to_rgb: bool indicating if bgr was converted to rgb
- 'pcd_trans': point cloud transformations
- 'sample_idx': sample index
- 'pcd_scale_factor': point cloud scale factor
- 'pcd_rotation': rotation applied to point cloud
- 'pts_filename': path to point cloud file.
Args:
keys (Sequence[str]): Keys of results to be collected in ``data``.
meta_keys (Sequence[str], optional): Meta keys to be converted to
``mmcv.DataContainer`` and collected in ``data[img_metas]``.
Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img',
'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip',
'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d',
'box_type_3d', 'img_norm_cfg', 'pcd_trans',
'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename')
"""
def __init__(self,
keys,
meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img','lidar2cam',
'depth2img', 'cam2img', 'pad_shape',
'scale_factor', 'flip', 'pcd_horizontal_flip',
'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx', 'next_idx',
'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
'transformation_3d_flow', 'scene_token',
'can_bus',
)):
self.keys = keys
self.meta_keys = meta_keys
def __call__(self, results):
"""Call function to collect keys in results. The keys in ``meta_keys``
will be converted to :obj:`mmcv.DataContainer`.
Args:
results (dict): Result dict contains the data to collect.
Returns:
dict: The result dict contains the following keys
- keys in ``self.keys``
- ``img_metas``
"""
data = {}
img_metas = {}
for key in self.meta_keys:
if key in results:
img_metas[key] = results[key]
data['img_metas'] = DC(img_metas, cpu_only=True)
for key in self.keys:
if key not in results:
data[key] = None
else:
data[key] = results[key]
return data
def __repr__(self):
"""str: Return a string that describes the module."""
return self.__class__.__name__ + \
f'(keys={self.keys}, meta_keys={self.meta_keys})'
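# Example config sketch (the keys are assumptions): collect the tensors the
# model consumes and pack everything listed in ``meta_keys`` into 'img_metas'.
# dict(type='CustomCollect3D', keys=['gt_bboxes_3d', 'gt_labels_3d', 'img'])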
@PIPELINES.register_module()
class RandomScaleImageMultiViewImage(object):
"""Random scale the image
Args:
scales
"""
def __init__(self, scales=[]):
self.scales = scales
assert len(self.scales)==1
def __call__(self, results):
"""Call function to pad images, masks, semantic segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Updated result dict.
"""
rand_ind = np.random.permutation(range(len(self.scales)))[0]
rand_scale = self.scales[rand_ind]
y_size = [int(img.shape[0] * rand_scale) for img in results['img']]
x_size = [int(img.shape[1] * rand_scale) for img in results['img']]
scale_factor = np.eye(4)
scale_factor[0, 0] *= rand_scale
scale_factor[1, 1] *= rand_scale
results['img'] = [mmcv.imresize(img, (x_size[idx], y_size[idx]), return_scale=False) for idx, img in
enumerate(results['img'])]
lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']]
results['lidar2img'] = lidar2img
results['img_shape'] = [img.shape for img in results['img']]
results['ori_shape'] = [img.shape for img in results['img']]
return results
def __repr__(self):
repr_str = self.__class__.__name__
        repr_str += f'(scales={self.scales})'
        return repr_str
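# Example config sketch (the scale value is an assumption): resize every view
# to half resolution and rescale the lidar2img projection matrices to match.
# dict(type='RandomScaleImageMultiViewImage', scales=[0.5])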
from .group_sampler import DistributedGroupSampler
from .distributed_sampler import DistributedSampler
from .sampler import SAMPLER, build_sampler
import math
import torch
from torch.utils.data import DistributedSampler as _DistributedSampler
from .sampler import SAMPLER
@SAMPLER.register_module()
class DistributedSampler(_DistributedSampler):
def __init__(self,
dataset=None,
num_replicas=None,
rank=None,
shuffle=True,
seed=0):
super().__init__(
dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
# for the compatibility from PyTorch 1.3+
self.seed = seed if seed is not None else 0
def __iter__(self):
# deterministically shuffle based on epoch
if self.shuffle:
assert False
else:
indices = torch.arange(len(self.dataset)).tolist()
# add extra samples to make it evenly divisible
# in case that indices is shorter than half of total_size
indices = (indices *
math.ceil(self.total_size / len(indices)))[:self.total_size]
assert len(indices) == self.total_size
# subsample
per_replicas = self.total_size//self.num_replicas
# indices = indices[self.rank:self.total_size:self.num_replicas]
indices = indices[self.rank*per_replicas:(self.rank+1)*per_replicas]
assert len(indices) == self.num_samples
return iter(indices)
# Copyright (c) OpenMMLab. All rights reserved.
import math
import numpy as np
import torch
from mmcv.runner import get_dist_info
from torch.utils.data import Sampler
from .sampler import SAMPLER
import random
@SAMPLER.register_module()
class DistributedGroupSampler(Sampler):
"""Sampler that restricts data loading to a subset of the dataset.
It is especially useful in conjunction with
:class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
process can pass a DistributedSampler instance as a DataLoader sampler,
and load a subset of the original dataset that is exclusive to it.
.. note::
Dataset is assumed to be of constant size.
Arguments:
dataset: Dataset used for sampling.
num_replicas (optional): Number of processes participating in
distributed training.
rank (optional): Rank of the current process within num_replicas.
seed (int, optional): random seed used to shuffle the sampler if
``shuffle=True``. This number should be identical across all
processes in the distributed group. Default: 0.
"""
def __init__(self,
dataset,
samples_per_gpu=1,
num_replicas=None,
rank=None,
seed=0):
_rank, _num_replicas = get_dist_info()
if num_replicas is None:
num_replicas = _num_replicas
if rank is None:
rank = _rank
self.dataset = dataset
self.samples_per_gpu = samples_per_gpu
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
self.seed = seed if seed is not None else 0
assert hasattr(self.dataset, 'flag')
self.flag = self.dataset.flag
self.group_sizes = np.bincount(self.flag)
self.num_samples = 0
for i, j in enumerate(self.group_sizes):
self.num_samples += int(
math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
self.num_replicas)) * self.samples_per_gpu
self.total_size = self.num_samples * self.num_replicas
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch + self.seed)
indices = []
for i, size in enumerate(self.group_sizes):
if size > 0:
indice = np.where(self.flag == i)[0]
assert len(indice) == size
# add .numpy() to avoid bug when selecting indice in parrots.
# TODO: check whether torch.randperm() can be replaced by
# numpy.random.permutation().
indice = indice[list(
torch.randperm(int(size), generator=g).numpy())].tolist()
extra = int(
math.ceil(
size * 1.0 / self.samples_per_gpu / self.num_replicas)
) * self.samples_per_gpu * self.num_replicas - len(indice)
# pad indice
tmp = indice.copy()
for _ in range(extra // size):
indice.extend(tmp)
indice.extend(tmp[:extra % size])
indices.extend(indice)
assert len(indices) == self.total_size
indices = [
indices[j] for i in list(
torch.randperm(
len(indices) // self.samples_per_gpu, generator=g))
for j in range(i * self.samples_per_gpu, (i + 1) *
self.samples_per_gpu)
]
# subsample
offset = self.num_samples * self.rank
indices = indices[offset:offset + self.num_samples]
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
from mmcv.utils.registry import Registry, build_from_cfg
SAMPLER = Registry('sampler')
def build_sampler(cfg, default_args):
return build_from_cfg(cfg, SAMPLER, default_args)
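# Usage sketch (the ``dataset`` variable is an assumption): build a registered
# sampler from a config dict, passing the dataset through ``default_args`` so
# that every sampler type receives it.
# sampler = build_sampler(
#     dict(type='DistributedGroupSampler', samples_per_gpu=1),
#     default_args=dict(dataset=dataset))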
from .modeling import *
# Copyright 2021 Toyota Research Institute. All rights reserved.
#import functools
from collections import OrderedDict
import numpy as np
import seaborn as sns
from torch.utils.data import Dataset
from tqdm import tqdm
#from detectron2.data import MetadataCatalog
from detectron2.structures.boxes import BoxMode
from nuscenes.eval.detection.utils import category_to_detection_name
from nuscenes.nuscenes import NuScenes
from nuscenes.utils.splits import create_splits_scenes
#from tridet.data import collect_dataset_dicts
from projects.mmdet3d_plugin.dd3d.structures.boxes3d import GenericBoxes3D
from projects.mmdet3d_plugin.dd3d.structures.pose import Pose
from projects.mmdet3d_plugin.dd3d.utils.geometry import project_points3d
from projects.mmdet3d_plugin.dd3d.utils.visualization import float_to_uint8_color
# https://github.com/nutonomy/nuscenes-devkit/blob/9b209638ef3dee6d0cdc5ac700c493747f5b35fe/python-sdk/nuscenes/utils/splits.py#L189
# - train/val/test: The standard splits of the nuScenes dataset (700/150/150 scenes).
# - mini_train/mini_val: Train and val splits of the mini subset used for visualization and debugging (8/2 scenes).
# - train_detect/train_track: Two halves of the train split used for separating the training sets of detector and
# tracker if required
DATASET_NAME_TO_VERSION = {
"nusc_train": "v1.0-trainval",
"nusc_val": "v1.0-trainval",
"nusc_val-subsample-8": "v1.0-trainval",
"nusc_trainval": "v1.0-trainval",
"nusc_test": "v1.0-test",
"nusc_mini_train": "v1.0-mini",
"nusc_mini_val": "v1.0-mini",
}
CAMERA_NAMES = ('CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT')
ATTRIBUTE_IDS = {
'vehicle.moving': 0,
'vehicle.parked': 1,
'vehicle.stopped': 2,
'pedestrian.moving': 0,
'pedestrian.standing': 1,
'pedestrian.sitting_lying_down': 2,
'cycle.with_rider': 0,
'cycle.without_rider': 1,
}
CATEGORY_IDS = OrderedDict({
'barrier': 0,
'bicycle': 1,
'bus': 2,
'car': 3,
'construction_vehicle': 4,
'motorcycle': 5,
'pedestrian': 6,
'traffic_cone': 7,
'trailer': 8,
'truck': 9,
})
COLORS = [float_to_uint8_color(clr) for clr in sns.color_palette("bright", n_colors=10)]
COLORMAP = OrderedDict({
'barrier': COLORS[8], # yellow
'bicycle': COLORS[0], # blue
'bus': COLORS[6], # pink
'car': COLORS[2], # green
'construction_vehicle': COLORS[7], # gray
'motorcycle': COLORS[4], # purple
'pedestrian': COLORS[1], # orange
'traffic_cone': COLORS[3], # red
'trailer': COLORS[9], # skyblue
'truck': COLORS[5], # brown
})
MAX_NUM_ATTRIBUTES = 3
def _compute_iou(box1, box2):
"""
Parameters
----------
box1, box2:
(x1, y1, x2, y2)
"""
xx1 = max(box1[0], box2[0])
yy1 = max(box1[1], box2[1])
xx2 = min(box1[2], box2[2])
yy2 = min(box1[3], box2[3])
if xx1 >= xx2 or yy1 >= yy2:
return 0.
inter = (xx2 - xx1) * (yy2 - yy1)
a1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
a2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
return inter / (a1 + a2 - inter)
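# Quick sanity check for _compute_iou (values worked out by hand):
# _compute_iou([0, 0, 2, 2], [1, 1, 3, 3]) -> 1 / 7 ≈ 0.143
# (intersection is 1x1 = 1, union is 4 + 4 - 1 = 7)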
class NuscenesDataset(Dataset):
def __init__(self, name, data_root, datum_names=CAMERA_NAMES, min_num_lidar_points=3, min_box_visibility=0.2, **unused):
self.data_root = data_root
assert name in DATASET_NAME_TO_VERSION
version = DATASET_NAME_TO_VERSION[name]
self.nusc = NuScenes(version=version, dataroot=data_root, verbose=True)
self.datum_names = datum_names
self.min_num_lidar_points = min_num_lidar_points
self.min_box_visibility = min_box_visibility
self.dataset_item_info = self._build_dataset_item_info(name)
# Index instance tokens to their IDs
self._instance_token_to_id = self._index_instance_tokens()
# Construct the mapping from datum_token (image id) to index
print("Generating the mapping from image id to idx...")
self.datumtoken2idx = {}
for idx, (datum_token, _, _, _, _) in enumerate(self.dataset_item_info):
self.datumtoken2idx[datum_token] = idx
print("Done.")
def _build_dataset_item_info(self, name):
scenes_in_split = self._get_split_scenes(name)
dataset_items = []
for _, scene_token in tqdm(scenes_in_split):
scene = self.nusc.get('scene', scene_token)
sample_token = scene['first_sample_token']
for sample_idx in range(scene['nbr_samples']):
if name.endswith('subsample-8') and sample_idx % 8 > 0:
# Sample-level subsampling.
continue
sample = self.nusc.get('sample', sample_token)
for datum_name, datum_token in sample['data'].items():
if datum_name not in self.datum_names:
continue
dataset_items.append((datum_token, sample_token, scene['name'], sample_idx, datum_name))
sample_token = sample['next']
return dataset_items
def _get_split_scenes(self, name):
scenes_in_splits = create_splits_scenes()
if name == "nusc_trainval":
scenes = scenes_in_splits["train"] + scenes_in_splits["val"]
elif name == "nusc_val-subsample-8":
scenes = scenes_in_splits["val"]
else:
assert name.startswith('nusc_'), f"Invalid dataset name: {name}"
split = name[5:]
assert split in scenes_in_splits, f"Invalid dataset: {split}"
scenes = scenes_in_splits[split]
# Mapping from scene name to token.
name_to_token = {scene['name']: scene['token'] for scene in self.nusc.scene}
return [(name, name_to_token[name]) for name in scenes]
def __len__(self):
return len(self.dataset_item_info)
def _build_id(self, scene_name, sample_idx, datum_name):
sample_id = f"{scene_name}_{sample_idx:03d}"
image_id = f"{sample_id}_{datum_name}"
return image_id, sample_id
def _index_instance_tokens(self):
"""Index instance tokens for uniquely identifying instances across samples"""
instance_token_to_id = {}
for record in self.nusc.sample_annotation:
instance_token = record['instance_token']
if instance_token not in instance_token_to_id:
next_instance_id = len(instance_token_to_id)
instance_token_to_id[instance_token] = next_instance_id
return instance_token_to_id
def get_instance_annotations(self, annotation_list, K, image_shape, pose_WS):
annotations = []
for _ann in annotation_list:
ann = self.nusc.get('sample_annotation', _ann.token)
if ann['num_lidar_pts'] + ann['num_radar_pts'] < self.min_num_lidar_points:
continue
annotation = OrderedDict()
# --------
# Category
# --------
category = category_to_detection_name(ann['category_name'])
if category is None:
continue
annotation['category_id'] = CATEGORY_IDS[category]
# ------
# 3D box
# ------
# NOTE: ann['rotation'], ann['translation'] is in global frame.
pose_SO = Pose(wxyz=_ann.orientation, tvec=_ann.center) # pose in sensor frame
# DEBUG:
# pose_WO_1 = Pose(np.array(ann['rotation']), np.array(ann['translation']))
# pose_WO_2 = pose_WS * pose_SO
# assert np.allclose(pose_WO_1.matrix, pose_WO_2.matrix)
bbox3d = GenericBoxes3D(_ann.orientation, _ann.center, _ann.wlh)
annotation['bbox3d'] = bbox3d.vectorize().tolist()[0]
# --------------------------------------
# 2D box -- project 8 corners of 3D bbox
# --------------------------------------
corners = project_points3d(bbox3d.corners.cpu().numpy().squeeze(0), K)
l, t = corners[:, 0].min(), corners[:, 1].min()
r, b = corners[:, 0].max(), corners[:, 1].max()
x1 = max(0, l)
y1 = max(0, t)
x2 = min(image_shape[1], r)
y2 = min(image_shape[0], b)
iou = _compute_iou([l, t, r, b], [x1, y1, x2, y2])
if iou < self.min_box_visibility:
continue
annotation['bbox'] = [x1, y1, x2, y2]
annotation['bbox_mode'] = BoxMode.XYXY_ABS
# --------
# Track ID
# --------
annotation['track_id'] = self._instance_token_to_id[ann['instance_token']]
# ---------
# Attribute
# ---------
attr_tokens = ann['attribute_tokens']
            assert len(attr_tokens) < 2  # NOTE: Allow at most one attribute.
attribute_id = MAX_NUM_ATTRIBUTES # By default, MAX_NUM_ATTRIBUTES -- this is to be ignored in loss compute.
if attr_tokens:
attribute = self.nusc.get('attribute', attr_tokens[0])['name']
attribute_id = ATTRIBUTE_IDS[attribute]
annotation['attribute_id'] = attribute_id
# -----
# Speed
# -----
vel_global = self.nusc.box_velocity(ann['token'])
speed = np.linalg.norm(vel_global) # NOTE: This can be NaN.
# DEBUG:
# speed * Quaternion(ann['rotation']).rotation_matrix.T[0] ~= vel_global
annotation['speed'] = speed
annotations.append(annotation)
return annotations
def _get_ego_velocity(self, current, max_time_diff=1.5):
"""Velocity of ego-vehicle in m/s.
"""
has_prev = current['prev'] != ''
has_next = current['next'] != ''
# Cannot estimate velocity for a single annotation.
if not has_prev and not has_next:
return np.array([np.nan, np.nan, np.nan])
if has_prev:
first = self.nusc.get('sample_data', current['prev'])
else:
first = current
if has_next:
last = self.nusc.get('sample_data', current['next'])
else:
last = current
pos_first = self.nusc.get('ego_pose', first['ego_pose_token'])['translation']
pos_last = self.nusc.get('ego_pose', last['ego_pose_token'])['translation']
pos_diff = np.float32(pos_last) - np.float32(pos_first)
time_last = 1e-6 * last['timestamp']
time_first = 1e-6 * first['timestamp']
time_diff = time_last - time_first
if has_next and has_prev:
# If doing centered difference, allow for up to double the max_time_diff.
max_time_diff *= 2
if time_diff > max_time_diff:
# If time_diff is too big, don't return an estimate.
return np.array([np.nan, np.nan, np.nan])
else:
return pos_diff / time_diff
def __getitem__(self, idx):
datum_token, sample_token, scene_name, sample_idx, datum_name = self.dataset_item_info[idx]
datum = self.nusc.get('sample_data', datum_token)
assert datum['is_key_frame']
filename, _annotations, K = self.nusc.get_sample_data(datum_token)
image_id, sample_id = self._build_id(scene_name, sample_idx, datum_name)
height, width = datum['height'], datum['width']
d2_dict = OrderedDict(
file_name=filename,
height=height,
width=width,
image_id=image_id,
sample_id=sample_id,
sample_token=sample_token
)
# Intrinsics
d2_dict['intrinsics'] = list(K.flatten())
# Get pose of the sensor (S) from vehicle (V) frame
_pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token'])
pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation']))
# Get ego-pose of the vehicle (V) from global/world (W) frame
_pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token'])
pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation']))
pose_WS = pose_WV * pose_VS
d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)}
d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)}
d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum))
d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS)
return d2_dict
def getitem_by_datumtoken(self, datum_token):
# idx = self.datumtoken2idx[datum_token]
# ret = self.__getitem__(idx)
datum = self.nusc.get('sample_data', datum_token)
sample_token = datum['sample_token']
filename, _annotations, K = self.nusc.get_sample_data(datum_token)
height, width = datum['height'], datum['width']
d2_dict = OrderedDict(
file_name=filename,
height=height,
width=width,
image_id=0,
sample_id=0,
sample_token=sample_token
)
# Intrinsics
d2_dict['intrinsics'] = list(K.flatten())
# Get pose of the sensor (S) from vehicle (V) frame
_pose_VS = self.nusc.get('calibrated_sensor', datum['calibrated_sensor_token'])
pose_VS = Pose(wxyz=np.float64(_pose_VS['rotation']), tvec=np.float64(_pose_VS['translation']))
# Get ego-pose of the vehicle (V) from global/world (W) frame
_pose_WV = self.nusc.get('ego_pose', datum['ego_pose_token'])
pose_WV = Pose(wxyz=np.float64(_pose_WV['rotation']), tvec=np.float64(_pose_WV['translation']))
pose_WS = pose_WV * pose_VS
d2_dict['pose'] = {'wxyz': list(pose_WS.quat.elements), 'tvec': list(pose_WS.tvec)}
d2_dict['extrinsics'] = {'wxyz': list(pose_VS.quat.elements), 'tvec': list(pose_VS.tvec)}
d2_dict['ego_speed'] = np.linalg.norm(self._get_ego_velocity(datum))
d2_dict['annotations'] = self.get_instance_annotations(_annotations, K, (height, width), pose_WS)
return d2_dict
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from detectron2:
# https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py
import numpy as np
import torch
from detectron2.data import transforms as T
from detectron2.structures import Boxes, BoxMode, Instances
from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
__all__ = ["transform_instance_annotations", "annotations_to_instances"]
def transform_instance_annotations(
annotation,
transforms,
image_size,
):
"""Adapted from:
https://github.com/facebookresearch/detectron2/blob/master/detectron2/data/detection_utils.py#L254
The changes from original:
- The presence of 2D bounding box (i.e. "bbox" field) is assumed by default in d2; here it's optional.
- Add optional 3D bounding box support.
- If the instance mask annotation is in RLE, then it's decoded into polygons, not bitmask, to save memory.
===============================================================================================================
Apply transforms to box, segmentation and keypoints annotations of a single instance.
It will use `transforms.apply_box` for the box, and
`transforms.apply_coords` for segmentation polygons & keypoints.
If you need anything more specially designed for each data structure,
you'll need to implement your own version of this function or the transforms.
Args:
annotation (dict): dict of instance annotations for a single instance.
It will be modified in-place.
transforms (TransformList or list[Transform]):
image_size (tuple): the height, width of the transformed image
keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`.
Returns:
dict:
the same input dict with fields "bbox", "segmentation", "keypoints"
transformed according to `transforms`.
The "bbox_mode" field will be set to XYXY_ABS.
"""
if isinstance(transforms, (tuple, list)):
transforms = T.TransformList(transforms)
# (dennis.park) Here 2D bounding box is optional.
if "bbox" in annotation:
assert "bbox_mode" in annotation, "'bbox' is present, but 'bbox_mode' is not."
# bbox is 1d (per-instance bounding box)
bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS)
bbox = transforms.apply_box(np.array([bbox]))[0]
# clip transformed bbox to image size
bbox = bbox.clip(min=0)
bbox = np.minimum(bbox, list(image_size + image_size)[::-1])
annotation["bbox"] = bbox
annotation["bbox_mode"] = BoxMode.XYXY_ABS
# Vertical flipping is not implemented (`flip_transform.py`). TODO: implement if needed.
if "bbox3d" in annotation:
bbox3d = np.array(annotation["bbox3d"])
annotation['bbox3d'] = transforms.apply_box3d(bbox3d)
return annotation
def _create_empty_instances(image_size):
target = Instances(image_size)
target.gt_boxes = Boxes([])
target.gt_classes = torch.tensor([], dtype=torch.int64)
target.gt_boxes3d = Boxes3D.from_vectors([], torch.eye(3, dtype=torch.float32))
return target
def annotations_to_instances(
annos,
image_size,
intrinsics=None,
):
"""
Create an :class:`Instances` object used by the models,
from instance annotations in the dataset dict.
Args:
annos (list[dict]): a list of instance annotations in one image, each
element for one instance.
image_size (tuple): height, width
Returns:
Instances:
It will contain fields "gt_boxes", "gt_classes",
"gt_masks", "gt_keypoints", if they can be obtained from `annos`.
This is the format that builtin models expect.
"""
if len(annos) == 0:
return _create_empty_instances(image_size)
boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]
target = Instances(image_size)
target.gt_boxes = Boxes(boxes)
classes = [obj["category_id"] for obj in annos]
classes = torch.tensor(classes, dtype=torch.int64)
target.gt_classes = classes
if len(annos) and "bbox3d" in annos[0]:
assert intrinsics is not None
target.gt_boxes3d = Boxes3D.from_vectors([anno['bbox3d'] for anno in annos], intrinsics)
if len(target.gt_boxes3d) != target.gt_boxes.tensor.shape[0]:
raise ValueError(
f"The sizes of `gt_boxes3d` and `gt_boxes` do not match: a={len(target.gt_boxes3d)}, b={target.gt_boxes.tensor.shape[0]}."
)
# NOTE: add nuscenes attributes here
# NOTE: instances will be filtered later
# NuScenes attributes
if len(annos) and "attribute_id" in annos[0]:
attributes = [obj["attribute_id"] for obj in annos]
target.gt_attributes = torch.tensor(attributes, dtype=torch.int64)
# Speed (magnitude of velocity)
if len(annos) and "speed" in annos[0]:
speeds = [obj["speed"] for obj in annos]
target.gt_speeds = torch.tensor(speeds, dtype=torch.float32)
    if len(annos) and "attribute_id" in annos[0] and "speed" in annos[0]:
        assert len(boxes) == len(classes) == len(attributes) == len(speeds), \
            'the numbers of annotations should be the same'
return target
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from AdelaiDet:
# https://github.com/aim-uofa/AdelaiDet/blob/master/adet/layers/iou_loss.py
import torch
from torch import nn
class IOULoss(nn.Module):
"""
Intersetion Over Union (IoU) loss which supports three
different IoU computations:
* IoU
* Linear IoU
* gIoU
"""
def __init__(self, loc_loss_type='iou'):
super(IOULoss, self).__init__()
self.loc_loss_type = loc_loss_type
def forward(self, pred, target, weight=None):
"""
Args:
            pred: Nx4 predicted (left, top, right, bottom) distances
            target: Nx4 target (left, top, right, bottom) distances
            weight: N loss weight for each instance
"""
pred_left = pred[:, 0]
pred_top = pred[:, 1]
pred_right = pred[:, 2]
pred_bottom = pred[:, 3]
target_left = target[:, 0]
target_top = target[:, 1]
target_right = target[:, 2]
target_bottom = target[:, 3]
        target_area = (target_left + target_right) * \
            (target_top + target_bottom)
        pred_area = (pred_left + pred_right) * \
            (pred_top + pred_bottom)
        w_intersect = torch.min(pred_left, target_left) + \
            torch.min(pred_right, target_right)
        h_intersect = torch.min(pred_bottom, target_bottom) + \
            torch.min(pred_top, target_top)
        g_w_intersect = torch.max(pred_left, target_left) + \
            torch.max(pred_right, target_right)
        g_h_intersect = torch.max(pred_bottom, target_bottom) + \
            torch.max(pred_top, target_top)
        ac_union = g_w_intersect * g_h_intersect
        area_intersect = w_intersect * h_intersect
        area_union = target_area + pred_area - area_intersect
        ious = (area_intersect + 1.0) / (area_union + 1.0)
        gious = ious - (ac_union - area_union) / ac_union
if self.loc_loss_type == 'iou':
losses = -torch.log(ious)
elif self.loc_loss_type == 'linear_iou':
losses = 1 - ious
elif self.loc_loss_type == 'giou':
losses = 1 - gious
else:
raise NotImplementedError
if weight is not None:
return (losses * weight).sum()
else:
return losses.sum()
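# Worked example (hand-checked): for a single location with identical
# (l, t, r, b) = (1, 1, 1, 1) predictions and targets, the intersection,
# union and enclosing box all have area 4, so ious = gious = 1; the
# 'linear_iou' and 'giou' losses are 0 and the 'iou' loss is -log(1) = 0.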
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from AdelaiDet
# https://github.com/aim-uofa/AdelaiDet/
import logging
import torch
from torch import nn
LOG = logging.getLogger(__name__)
class Scale(nn.Module):
def __init__(self, init_value=1.0):
super(Scale, self).__init__()
self.scale = nn.Parameter(torch.FloatTensor([init_value]))
def forward(self, input):
return input * self.scale
class Offset(nn.Module):
def __init__(self, init_value=0.):
super(Offset, self).__init__()
self.bias = nn.Parameter(torch.FloatTensor([init_value]))
def forward(self, input):
return input + self.bias
class ModuleListDial(nn.ModuleList):
def __init__(self, modules=None):
super(ModuleListDial, self).__init__(modules)
self.cur_position = 0
def forward(self, x):
result = self[self.cur_position](x)
self.cur_position += 1
if self.cur_position >= len(self):
self.cur_position = 0
return result
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from fvcore:
# https://github.com/facebookresearch/fvcore/blob/master/fvcore/nn/smooth_l1_loss.py
import torch
def smooth_l1_loss(input: torch.Tensor, target: torch.Tensor, beta: float, reduction: str = "none") -> torch.Tensor:
"""
Smooth L1 loss defined in the Fast R-CNN paper as:
| 0.5 * x ** 2 / beta if abs(x) < beta
smoothl1(x) = |
| abs(x) - 0.5 * beta otherwise,
where x = input - target.
Smooth L1 loss is related to Huber loss, which is defined as:
| 0.5 * x ** 2 if abs(x) < beta
huber(x) = |
| beta * (abs(x) - 0.5 * beta) otherwise
Smooth L1 loss is equal to huber(x) / beta. This leads to the following
differences:
- As beta -> 0, Smooth L1 loss converges to L1 loss, while Huber loss
converges to a constant 0 loss.
- As beta -> +inf, Smooth L1 converges to a constant 0 loss, while Huber loss
converges to L2 loss.
- For Smooth L1 loss, as beta varies, the L1 segment of the loss has a constant
slope of 1. For Huber loss, the slope of the L1 segment is beta.
Smooth L1 loss can be seen as exactly L1 loss, but with the abs(x) < beta
portion replaced with a quadratic function such that at abs(x) = beta, its
slope is 1. The quadratic segment smooths the L1 loss near x = 0.
Args:
input (Tensor): input tensor of any shape
target (Tensor): target value tensor with the same shape as input
beta (float): L1 to L2 change point.
For beta values < 1e-5, L1 loss is computed.
reduction: 'none' | 'mean' | 'sum'
'none': No reduction will be applied to the output.
'mean': The output will be averaged.
'sum': The output will be summed.
Returns:
The loss with the reduction option applied.
Note:
PyTorch's builtin "Smooth L1 loss" implementation does not actually
implement Smooth L1 loss, nor does it implement Huber loss. It implements
the special case of both in which they are equal (beta=1).
See: https://pytorch.org/docs/stable/nn.html#torch.nn.SmoothL1Loss.
"""
# (dennis.park) Make it work with mixed precision training.
beta = torch.as_tensor(beta).to(input.dtype)
if beta < 1e-5:
# if beta == 0, then torch.where will result in nan gradients when
# the chain rule is applied due to pytorch implementation details
# (the False branch "0.5 * n ** 2 / 0" has an incoming gradient of
# zeros, rather than "no gradient"). To avoid this issue, we define
# small values of beta to be exactly l1 loss.
loss = torch.abs(input - target)
else:
n = torch.abs(input - target)
cond = n < beta
a = 0.5 * n**2
b = n - 0.5 * beta
a, b = a.to(input.dtype), b.to(input.dtype)
loss = torch.where(cond, a, b)
# loss = torch.where(cond, 0.5 * n ** 2 / beta, n - 0.5 * beta)
if reduction == "mean":
loss = loss.mean()
elif reduction == "sum":
loss = loss.sum()
return loss
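# Worked examples (hand-checked), with beta = 1.0 and the default reduction:
#   smooth_l1_loss(torch.tensor([0.5]), torch.tensor([0.0]), beta=1.0) -> 0.5 * 0.5**2 = 0.125
#   smooth_l1_loss(torch.tensor([2.0]), torch.tensor([0.0]), beta=1.0) -> 2.0 - 0.5 = 1.5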
from .nuscenes_dd3d import NuscenesDD3D
# Copyright 2021 Toyota Research Institute. All rights reserved.
import torch
from torch import nn
#from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling.postprocessing import detector_postprocess as resize_instances
from detectron2.structures import Instances
from detectron2.layers import ShapeSpec
from mmcv.runner import force_fp32
from .fcos2d import FCOS2DHead, FCOS2DInference, FCOS2DLoss
from .fcos3d import FCOS3DHead, FCOS3DInference, FCOS3DLoss
#from tridet.modeling.dd3d.postprocessing import nuscenes_sample_aggregate
from .prepare_targets import DD3DTargetPreparer
#from tridet.modeling.feature_extractor import build_feature_extractor
from projects.mmdet3d_plugin.dd3d.structures.image_list import ImageList
from projects.mmdet3d_plugin.dd3d.utils.tensor2d import compute_features_locations as compute_locations_per_level
#@META_ARCH_REGISTRY.register()
class DD3D(nn.Module):
def __init__(self,
num_classes,
in_channels,
strides,
fcos2d_cfg=dict(),
fcos2d_loss_cfg=dict(),
fcos3d_cfg=dict(),
fcos3d_loss_cfg=dict(),
target_assign_cfg=dict(),
box3d_on=True,
feature_locations_offset="none"):
super().__init__()
# NOTE: do not need backbone
# self.backbone = build_feature_extractor(cfg)
# backbone_output_shape = self.backbone.output_shape()
# self.in_features = cfg.DD3D.IN_FEATURES or list(backbone_output_shape.keys())
self.backbone_output_shape = [ShapeSpec(channels=in_channels, stride=s) for s in strides]
self.feature_locations_offset = feature_locations_offset
self.fcos2d_head = FCOS2DHead(num_classes=num_classes, input_shape=self.backbone_output_shape,
**fcos2d_cfg)
self.fcos2d_loss = FCOS2DLoss(num_classes=num_classes, **fcos2d_loss_cfg)
# NOTE: inference later
# self.fcos2d_inference = FCOS2DInference(cfg)
if box3d_on:
self.fcos3d_head = FCOS3DHead(num_classes=num_classes, input_shape=self.backbone_output_shape,
**fcos3d_cfg)
self.fcos3d_loss = FCOS3DLoss(num_classes=num_classes, **fcos3d_loss_cfg)
# NOTE: inference later
# self.fcos3d_inference = FCOS3DInference(cfg)
self.only_box2d = False
else:
self.only_box2d = True
self.prepare_targets = DD3DTargetPreparer(num_classes=num_classes,
input_shape=self.backbone_output_shape,
box3d_on=box3d_on,
**target_assign_cfg)
# NOTE: inference later
# self.postprocess_in_inference = cfg.DD3D.INFERENCE.DO_POSTPROCESS
# self.do_nms = cfg.DD3D.INFERENCE.DO_NMS
# self.do_bev_nms = cfg.DD3D.INFERENCE.DO_BEV_NMS
# self.bev_nms_iou_thresh = cfg.DD3D.INFERENCE.BEV_NMS_IOU_THRESH
# nuScenes inference aggregates detections over all 6 cameras.
# self.nusc_sample_aggregate_in_inference = cfg.DD3D.INFERENCE.NUSC_SAMPLE_AGGREGATE
self.num_classes = num_classes
# NOTE: do not need normalize
# self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1))
# self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1))
# NOTE:
# @property
# def device(self):
# return self.pixel_mean.device
# def preprocess_image(self, x):
# return (x - self.pixel_mean) / self.pixel_std
    @force_fp32(apply_to=('features', ))
def forward(self, features, batched_inputs):
# NOTE:
# images = [x["image"].to(self.device) for x in batched_inputs]
# images = [self.preprocess_image(x) for x in images]
# NOTE: directly use inv_intrinsics
# if 'intrinsics' in batched_inputs[0]:
# intrinsics = [x['intrinsics'].to(self.device) for x in batched_inputs]
# else:
# intrinsics = None
# images = ImageList.from_tensors(images, self.backbone.size_divisibility, intrinsics=intrinsics)
if 'inv_intrinsics' in batched_inputs[0]:
inv_intrinsics = [x['inv_intrinsics'].to(features[0].device) for x in batched_inputs]
inv_intrinsics = torch.stack(inv_intrinsics, dim=0)
else:
inv_intrinsics = None
# NOTE:
# gt_dense_depth = None
# if 'depth' in batched_inputs[0]:
# gt_dense_depth = [x["depth"].to(self.device) for x in batched_inputs]
# gt_dense_depth = ImageList.from_tensors(
# gt_dense_depth, self.backbone.size_divisibility, intrinsics=intrinsics
# )
# NOTE: directly input feature
# features = self.backbone(images.tensor)
# features = [features[f] for f in self.in_features]
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(features[0].device) for x in batched_inputs]
else:
gt_instances = None
locations = self.compute_locations(features)
logits, box2d_reg, centerness, _ = self.fcos2d_head(features)
if not self.only_box2d:
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth = self.fcos3d_head(features)
# NOTE: directly use inv_intrinsics
# inv_intrinsics = images.intrinsics.inverse() if images.intrinsics is not None else None
if self.training:
assert gt_instances is not None
feature_shapes = [x.shape[-2:] for x in features]
training_targets = self.prepare_targets(locations, gt_instances, feature_shapes)
# NOTE:
# if gt_dense_depth is not None:
# training_targets.update({"dense_depth": gt_dense_depth})
losses = {}
fcos2d_loss, fcos2d_info = self.fcos2d_loss(logits, box2d_reg, centerness, training_targets)
losses.update(fcos2d_loss)
if not self.only_box2d:
fcos3d_loss = self.fcos3d_loss(
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics,
fcos2d_info, training_targets
)
losses.update(fcos3d_loss)
return losses
else:
# TODO: do not support inference now
raise NotImplementedError
pred_instances, fcos2d_info = self.fcos2d_inference(
logits, box2d_reg, centerness, locations, images.image_sizes
)
if not self.only_box2d:
# This adds 'pred_boxes3d' and 'scores_3d' to Instances in 'pred_instances' in place.
self.fcos3d_inference(
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances,
fcos2d_info
)
# 3D score == 2D score x confidence.
score_key = "scores_3d"
else:
score_key = "scores"
# Transpose to "image-first", i.e. (B, L)
pred_instances = list(zip(*pred_instances))
pred_instances = [Instances.cat(instances) for instances in pred_instances]
# 2D NMS and pick top-K.
if self.do_nms:
pred_instances = self.fcos2d_inference.nms_and_top_k(pred_instances, score_key)
if not self.only_box2d and self.do_bev_nms:
# Bird-eye-view NMS.
dummy_group_idxs = {i: [i] for i, _ in enumerate(pred_instances)}
if 'pose' in batched_inputs[0]:
poses = [x['pose'] for x in batched_inputs]
else:
poses = [x['extrinsics'] for x in batched_inputs]
pred_instances = nuscenes_sample_aggregate(
pred_instances,
dummy_group_idxs,
self.num_classes,
poses,
iou_threshold=self.bev_nms_iou_thresh,
include_boxes3d_global=False
)
if self.postprocess_in_inference:
processed_results = []
for results_per_image, input_per_image, image_size in \
zip(pred_instances, batched_inputs, images.image_sizes):
height = input_per_image.get("height", image_size[0])
width = input_per_image.get("width", image_size[1])
r = resize_instances(results_per_image, height, width)
processed_results.append({"instances": r})
else:
processed_results = [{"instances": x} for x in pred_instances]
return processed_results
def compute_locations(self, features):
locations = []
in_strides = [x.stride for x in self.backbone_output_shape]
for level, feature in enumerate(features):
h, w = feature.size()[-2:]
locations_per_level = compute_locations_per_level(
h, w, in_strides[level], feature.dtype, feature.device, offset=self.feature_locations_offset
)
locations.append(locations_per_level)
return locations
def forward_train(self, features, batched_inputs):
self.train()
return self.forward(features, batched_inputs)
# Copyright 2021 Toyota Research Institute. All rights reserved.
import logging
import torch
import torch.nn as nn
from projects.mmdet3d_plugin.dd3d.layers.smooth_l1_loss import smooth_l1_loss
LOG = logging.getLogger(__name__)
class DisentangledBox3DLoss(nn.Module):
def __init__(self, smooth_l1_loss_beta, max_loss_per_group):
super().__init__()
self.smooth_l1_loss_beta = smooth_l1_loss_beta
self.max_loss_per_group = max_loss_per_group
def forward(self, box3d_pred, box3d_targets, locations, weights=None):
box3d_pred = box3d_pred.to(torch.float32)
box3d_targets = box3d_targets.to(torch.float32)
target_corners = box3d_targets.corners
disentangled_losses = {}
for component_key in ["quat", "proj_ctr", "depth", "size"]:
disentangled_boxes = box3d_targets.clone()
setattr(disentangled_boxes, component_key, getattr(box3d_pred, component_key))
pred_corners = disentangled_boxes.to(torch.float32).corners
loss = smooth_l1_loss(pred_corners, target_corners, beta=self.smooth_l1_loss_beta)
            # Bound the loss
            loss = loss.clamp(max=self.max_loss_per_group)
if weights is not None:
# loss = torch.sum(loss.reshape(-1, 24) * weights.unsqueeze(-1))
loss = torch.sum(loss.reshape(-1, 24).mean(dim=1) * weights)
else:
loss = loss.reshape(-1, 24).mean()
disentangled_losses["loss_box3d_" + component_key] = loss
entangled_l1_dist = (target_corners - box3d_pred.corners).detach().abs().reshape(-1, 24).mean(dim=1)
return disentangled_losses, entangled_l1_dist
# Copyright 2021 Toyota Research Institute. All rights reserved.
# Adapted from AdelaiDet:
# https://github.com/aim-uofa/AdelaiDet
import torch
from fvcore.nn import sigmoid_focal_loss
from torch import nn
from torch.nn import functional as F
from detectron2.layers import Conv2d, batched_nms, cat, get_norm
from detectron2.structures import Boxes, Instances
from detectron2.utils.comm import get_world_size
from mmcv.runner import force_fp32
from projects.mmdet3d_plugin.dd3d.layers.iou_loss import IOULoss
from projects.mmdet3d_plugin.dd3d.layers.normalization import ModuleListDial, Scale
from projects.mmdet3d_plugin.dd3d.utils.comm import reduce_sum
INF = 100000000
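# FCOS centerness target for a location with regression targets (l, t, r, b):
#   centerness = sqrt((min(l, r) / max(l, r)) * (min(t, b) / max(t, b)))
# which is what compute_ctrness_targets below evaluates per location.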
def compute_ctrness_targets(reg_targets):
if len(reg_targets) == 0:
return reg_targets.new_zeros(len(reg_targets))
left_right = reg_targets[:, [0, 2]]
top_bottom = reg_targets[:, [1, 3]]
ctrness = (left_right.min(dim=-1)[0] / left_right.max(dim=-1)[0]) * \
(top_bottom.min(dim=-1)[0] / top_bottom.max(dim=-1)[0])
return torch.sqrt(ctrness)
class FCOS2DHead(nn.Module):
def __init__(self,
num_classes,
input_shape,
num_cls_convs=4,
num_box_convs=4,
norm='BN',
use_deformable=False,
use_scale=True,
box2d_scale_init_factor=1.0,
version='v2'):
super().__init__()
self.num_classes = num_classes
self.in_strides = [shape.stride for shape in input_shape]
self.num_levels = len(input_shape)
self.use_scale = use_scale
self.box2d_scale_init_factor = box2d_scale_init_factor
self._version = version
in_channels = [s.channels for s in input_shape]
assert len(set(in_channels)) == 1, "Each level must have the same channel!"
in_channels = in_channels[0]
if use_deformable:
raise ValueError("Not supported yet.")
head_configs = {'cls': num_cls_convs, 'box2d': num_box_convs}
for head_name, num_convs in head_configs.items():
tower = []
if self._version == "v1":
for _ in range(num_convs):
conv_func = nn.Conv2d
tower.append(conv_func(in_channels, in_channels, kernel_size=3, stride=1, padding=1, bias=True))
if norm == "GN":
raise NotImplementedError()
elif norm == "NaiveGN":
raise NotImplementedError()
elif norm == "BN":
tower.append(ModuleListDial([nn.BatchNorm2d(in_channels) for _ in range(self.num_levels)]))
elif norm == "SyncBN":
raise NotImplementedError()
tower.append(nn.ReLU())
elif self._version == "v2":
for _ in range(num_convs):
if norm in ("BN", "FrozenBN", "SyncBN", "GN"):
# NOTE: need to add norm here!
# Each FPN level has its own batchnorm layer.
# NOTE: do not use dd3d train.py!
# "BN" is converted to "SyncBN" in distributed training (see train.py)
norm_layer = ModuleListDial([get_norm(norm, in_channels) for _ in range(self.num_levels)])
else:
norm_layer = get_norm(norm, in_channels)
tower.append(
Conv2d(
in_channels,
in_channels,
kernel_size=3,
stride=1,
padding=1,
bias=norm_layer is None,
norm=norm_layer,
activation=F.relu
)
)
else:
raise ValueError(f"Invalid FCOS2D version: {self._version}")
self.add_module(f'{head_name}_tower', nn.Sequential(*tower))
self.cls_logits = nn.Conv2d(in_channels, self.num_classes, kernel_size=3, stride=1, padding=1)
self.box2d_reg = nn.Conv2d(in_channels, 4, kernel_size=3, stride=1, padding=1)
self.centerness = nn.Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1)
if self.use_scale:
if self._version == "v1":
self.scales_reg = nn.ModuleList([
Scale(init_value=stride * self.box2d_scale_init_factor) for stride in self.in_strides
])
else:
self.scales_box2d_reg = nn.ModuleList([
Scale(init_value=stride * self.box2d_scale_init_factor) for stride in self.in_strides
])
self.init_weights()
def init_weights(self):
for tower in [self.cls_tower, self.box2d_tower]:
for l in tower.modules():
if isinstance(l, nn.Conv2d):
torch.nn.init.kaiming_normal_(l.weight, mode='fan_out', nonlinearity='relu')
if l.bias is not None:
torch.nn.init.constant_(l.bias, 0)
predictors = [self.cls_logits, self.box2d_reg, self.centerness]
for modules in predictors:
for l in modules.modules():
if isinstance(l, nn.Conv2d):
torch.nn.init.kaiming_uniform_(l.weight, a=1)
if l.bias is not None: # depth head may not have bias.
torch.nn.init.constant_(l.bias, 0)
def forward(self, x):
logits = []
box2d_reg = []
centerness = []
extra_output = {"cls_tower_out": []}
for l, feature in enumerate(x):
cls_tower_out = self.cls_tower(feature)
bbox_tower_out = self.box2d_tower(feature)
# 2D box
logits.append(self.cls_logits(cls_tower_out))
centerness.append(self.centerness(bbox_tower_out))
box_reg = self.box2d_reg(bbox_tower_out)
if self.use_scale:
# TODO: to optimize the runtime, apply this scaling in inference (and loss compute) only on FG pixels?
if self._version == "v1":
box_reg = self.scales_reg[l](box_reg)
else:
box_reg = self.scales_box2d_reg[l](box_reg)
# Note that we use relu, as in the improved FCOS, instead of exp.
box2d_reg.append(F.relu(box_reg))
extra_output['cls_tower_out'].append(cls_tower_out)
return logits, box2d_reg, centerness, extra_output
class FCOS2DLoss(nn.Module):
def __init__(self,
num_classes,
focal_loss_alpha=0.25,
focal_loss_gamma=2.0,
loc_loss_type='giou',
):
super().__init__()
self.focal_loss_alpha = focal_loss_alpha
self.focal_loss_gamma = focal_loss_gamma
self.box2d_reg_loss_fn = IOULoss(loc_loss_type)
self.num_classes = num_classes
@force_fp32(apply_to=('logits', 'box2d_reg', 'centerness'))
def forward(self, logits, box2d_reg, centerness, targets):
labels = targets['labels']
box2d_reg_targets = targets['box2d_reg_targets']
pos_inds = targets["pos_inds"]
if len(labels) != box2d_reg_targets.shape[0]:
raise ValueError(
f"The size of 'labels' and 'box2d_reg_targets' does not match: a={len(labels)}, b={box2d_reg_targets.shape[0]}"
)
# Flatten predictions
logits = cat([x.permute(0, 2, 3, 1).reshape(-1, self.num_classes) for x in logits])
box2d_reg_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 4) for x in box2d_reg])
centerness_pred = cat([x.permute(0, 2, 3, 1).reshape(-1) for x in centerness])
# -------------------
# Classification loss
# -------------------
num_pos_local = pos_inds.numel()
num_gpus = get_world_size()
total_num_pos = reduce_sum(pos_inds.new_tensor([num_pos_local])).item()
num_pos_avg = max(total_num_pos / num_gpus, 1.0)
# prepare one_hot
cls_target = torch.zeros_like(logits)
cls_target[pos_inds, labels[pos_inds]] = 1
loss_cls = sigmoid_focal_loss(
logits,
cls_target,
alpha=self.focal_loss_alpha,
gamma=self.focal_loss_gamma,
reduction="sum",
) / num_pos_avg
# NOTE: The rest of losses only consider foreground pixels.
box2d_reg_pred = box2d_reg_pred[pos_inds]
box2d_reg_targets = box2d_reg_targets[pos_inds]
centerness_pred = centerness_pred[pos_inds]
# Compute centerness targets here using 2D regression targets of foreground pixels.
centerness_targets = compute_ctrness_targets(box2d_reg_targets)
# Denominator for all foreground losses.
ctrness_targets_sum = centerness_targets.sum()
loss_denom = max(reduce_sum(ctrness_targets_sum).item() / num_gpus, 1e-6)
# NOTE: change the return after reduce_sum
if pos_inds.numel() == 0:
losses = {
"loss_cls": loss_cls,
"loss_box2d_reg": box2d_reg_pred.sum() * 0.,
"loss_centerness": centerness_pred.sum() * 0.,
}
return losses, {}
# ----------------------
# 2D box regression loss
# ----------------------
loss_box2d_reg = self.box2d_reg_loss_fn(box2d_reg_pred, box2d_reg_targets, centerness_targets) / loss_denom
# ---------------
# Centerness loss
# ---------------
loss_centerness = F.binary_cross_entropy_with_logits(
centerness_pred, centerness_targets, reduction="sum"
) / num_pos_avg
loss_dict = {"loss_cls": loss_cls, "loss_box2d_reg": loss_box2d_reg, "loss_centerness": loss_centerness}
extra_info = {"loss_denom": loss_denom, "centerness_targets": centerness_targets}
return loss_dict, extra_info
class FCOS2DInference():
def __init__(self, cfg):
self.thresh_with_ctr = cfg.DD3D.FCOS2D.INFERENCE.THRESH_WITH_CTR
self.pre_nms_thresh = cfg.DD3D.FCOS2D.INFERENCE.PRE_NMS_THRESH
self.pre_nms_topk = cfg.DD3D.FCOS2D.INFERENCE.PRE_NMS_TOPK
self.post_nms_topk = cfg.DD3D.FCOS2D.INFERENCE.POST_NMS_TOPK
self.nms_thresh = cfg.DD3D.FCOS2D.INFERENCE.NMS_THRESH
self.num_classes = cfg.DD3D.NUM_CLASSES
def __call__(self, logits, box2d_reg, centerness, locations, image_sizes):
pred_instances = [] # List[List[Instances]], shape = (L, B)
extra_info = []
for lvl, (logits_lvl, box2d_reg_lvl, centerness_lvl, locations_lvl) in \
enumerate(zip(logits, box2d_reg, centerness, locations)):
instances_per_lvl, extra_info_per_lvl = self.forward_for_single_feature_map(
logits_lvl, box2d_reg_lvl, centerness_lvl, locations_lvl, image_sizes
) # List of Instances; one for each image.
for instances_per_im in instances_per_lvl:
instances_per_im.fpn_levels = locations_lvl.new_ones(len(instances_per_im), dtype=torch.long) * lvl
pred_instances.append(instances_per_lvl)
extra_info.append(extra_info_per_lvl)
return pred_instances, extra_info
def forward_for_single_feature_map(self, logits, box2d_reg, centerness, locations, image_sizes):
N, C, _, __ = logits.shape
# put in the same format as locations
scores = logits.permute(0, 2, 3, 1).reshape(N, -1, C).sigmoid()
box2d_reg = box2d_reg.permute(0, 2, 3, 1).reshape(N, -1, 4)
centerness = centerness.permute(0, 2, 3, 1).reshape(N, -1).sigmoid()
# if self.thresh_with_ctr is True, we multiply the classification
# scores with centerness scores before applying the threshold.
if self.thresh_with_ctr:
scores = scores * centerness[:, :, None]
candidate_mask = scores > self.pre_nms_thresh
pre_nms_topk = candidate_mask.reshape(N, -1).sum(1)
pre_nms_topk = pre_nms_topk.clamp(max=self.pre_nms_topk)
if not self.thresh_with_ctr:
scores = scores * centerness[:, :, None]
results = []
all_fg_inds_per_im, all_topk_indices, all_class_inds_per_im = [], [], []
for i in range(N):
scores_per_im = scores[i]
candidate_mask_per_im = candidate_mask[i]
scores_per_im = scores_per_im[candidate_mask_per_im]
candidate_inds_per_im = candidate_mask_per_im.nonzero(as_tuple=False)
fg_inds_per_im = candidate_inds_per_im[:, 0]
class_inds_per_im = candidate_inds_per_im[:, 1]
# Cache info here.
all_fg_inds_per_im.append(fg_inds_per_im)
all_class_inds_per_im.append(class_inds_per_im)
box2d_reg_per_im = box2d_reg[i][fg_inds_per_im]
locations_per_im = locations[fg_inds_per_im]
pre_nms_topk_per_im = pre_nms_topk[i]
if candidate_mask_per_im.sum().item() > pre_nms_topk_per_im.item():
scores_per_im, topk_indices = \
scores_per_im.topk(pre_nms_topk_per_im, sorted=False)
class_inds_per_im = class_inds_per_im[topk_indices]
box2d_reg_per_im = box2d_reg_per_im[topk_indices]
locations_per_im = locations_per_im[topk_indices]
else:
topk_indices = None
all_topk_indices.append(topk_indices)
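# Decode the per-location (l, t, r, b) offsets into (x1, y1, x2, y2) boxes around each candidate location.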
detections = torch.stack([
locations_per_im[:, 0] - box2d_reg_per_im[:, 0],
locations_per_im[:, 1] - box2d_reg_per_im[:, 1],
locations_per_im[:, 0] + box2d_reg_per_im[:, 2],
locations_per_im[:, 1] + box2d_reg_per_im[:, 3],
],
dim=1)
instances = Instances(image_sizes[i])
instances.pred_boxes = Boxes(detections)
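# scores_per_im already includes the centerness factor, so sqrt() stores the geometric mean of classification score and centerness.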
instances.scores = torch.sqrt(scores_per_im)
instances.pred_classes = class_inds_per_im
instances.locations = locations_per_im
results.append(instances)
extra_info = {
"fg_inds_per_im": all_fg_inds_per_im,
"class_inds_per_im": all_class_inds_per_im,
"topk_indices": all_topk_indices
}
return results, extra_info
def nms_and_top_k(self, instances_per_im, score_key_for_nms="scores"):
results = []
for instances in instances_per_im:
if self.nms_thresh > 0:
# Multiclass NMS.
keep = batched_nms(
instances.pred_boxes.tensor, instances.get(score_key_for_nms), instances.pred_classes,
self.nms_thresh
)
instances = instances[keep]
num_detections = len(instances)
# Limit to max_per_image detections **over all classes**
if num_detections > self.post_nms_topk > 0:
scores = instances.scores
# image_thresh, _ = torch.kthvalue(scores.cpu(), num_detections - self.post_nms_topk + 1)
image_thresh, _ = torch.kthvalue(scores, num_detections - self.post_nms_topk + 1)
keep = scores >= image_thresh.item()
keep = torch.nonzero(keep).squeeze(1)
instances = instances[keep]
results.append(instances)
return results
# Copyright 2021 Toyota Research Institute. All rights reserved.
import torch
import torch.nn.functional as F
from torch import nn
from detectron2.layers import Conv2d, cat, get_norm
from mmcv.runner import force_fp32
from projects.mmdet3d_plugin.dd3d.layers.normalization import ModuleListDial, Offset, Scale
from .disentangled_box3d_loss import DisentangledBox3DLoss
from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
from projects.mmdet3d_plugin.dd3d.utils.geometry import allocentric_to_egocentric, unproject_points2d
EPS = 1e-7
def predictions_to_boxes3d(
quat,
proj_ctr,
depth,
size,
locations,
inv_intrinsics,
canon_box_sizes,
min_depth,
max_depth,
scale_depth_by_focal_lengths_factor,
scale_depth_by_focal_lengths=True,
quat_is_allocentric=True,
depth_is_distance=False
):
# Normalize to make quat unit norm.
quat = quat / quat.norm(dim=1, keepdim=True).clamp(min=EPS)
# Make sure again it's numerically unit-norm.
quat = quat / quat.norm(dim=1, keepdim=True)
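# With scale_depth_by_focal_lengths, the raw depth is divided by ||(1/fx, 1/fy)|| * factor, so the same network output decodes to a larger metric depth for longer focal lengths.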
if scale_depth_by_focal_lengths:
pixel_size = torch.norm(torch.stack([inv_intrinsics[:, 0, 0], inv_intrinsics[:, 1, 1]], dim=-1), dim=-1)
depth = depth / (pixel_size * scale_depth_by_focal_lengths_factor)
if depth_is_distance:
depth = depth / unproject_points2d(locations, inv_intrinsics).norm(dim=1).clamp(min=EPS)
depth = depth.reshape(-1, 1).clamp(min_depth, max_depth)
proj_ctr = proj_ctr + locations
if quat_is_allocentric:
quat = allocentric_to_egocentric(quat, proj_ctr, inv_intrinsics)
size = (size.tanh() + 1.) * canon_box_sizes # max size = 2 * canon_size
return Boxes3D(quat, proj_ctr, depth, size, inv_intrinsics)
class FCOS3DHead(nn.Module):
def __init__(self,
num_classes,
input_shape,
num_convs=4,
norm='BN',
use_scale=True,
depth_scale_init_factor=0.3,
proj_ctr_scale_init_factor=1.0,
use_per_level_predictors=False,
class_agnostic=False,
use_deformable=False,
mean_depth_per_level=None,
std_depth_per_level=None,
):
super().__init__()
self.num_classes = num_classes
self.in_strides = [shape.stride for shape in input_shape]
self.num_levels = len(input_shape)
self.use_scale = use_scale
self.depth_scale_init_factor = depth_scale_init_factor
self.proj_ctr_scale_init_factor = proj_ctr_scale_init_factor
self.use_per_level_predictors = use_per_level_predictors
self.register_buffer("mean_depth_per_level", torch.Tensor(mean_depth_per_level))
self.register_buffer("std_depth_per_level", torch.Tensor(std_depth_per_level))
in_channels = [s.channels for s in input_shape]
assert len(set(in_channels)) == 1, "Each level must have the same number of channels!"
in_channels = in_channels[0]
if use_deformable:
raise ValueError("Not supported yet.")
box3d_tower = []
for i in range(num_convs):
if norm in ("BN", "FrozenBN", "SyncBN", "GN"):
# NOTE: need to add norm here!
# Each FPN level has its own batchnorm layer.
# NOTE: this project does not use dd3d's train.py, where "BN" is converted to "SyncBN" for distributed training.
norm_layer = ModuleListDial([get_norm(norm, in_channels) for _ in range(self.num_levels)])
else:
norm_layer = get_norm(norm, in_channels)
box3d_tower.append(
Conv2d(
in_channels,
in_channels,
kernel_size=3,
stride=1,
padding=1,
bias=norm_layer is None,
norm=norm_layer,
activation=F.relu
)
)
self.add_module('box3d_tower', nn.Sequential(*box3d_tower))
num_classes = self.num_classes if not class_agnostic else 1
num_levels = self.num_levels if use_per_level_predictors else 1
# 3D box branches.
self.box3d_quat = nn.ModuleList([
Conv2d(in_channels, 4 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
for _ in range(num_levels)
])
self.box3d_ctr = nn.ModuleList([
Conv2d(in_channels, 2 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
for _ in range(num_levels)
])
self.box3d_depth = nn.ModuleList([
Conv2d(in_channels, 1 * num_classes, kernel_size=3, stride=1, padding=1, bias=(not self.use_scale))
for _ in range(num_levels)
])
self.box3d_size = nn.ModuleList([
Conv2d(in_channels, 3 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
for _ in range(num_levels)
])
self.box3d_conf = nn.ModuleList([
Conv2d(in_channels, 1 * num_classes, kernel_size=3, stride=1, padding=1, bias=True)
for _ in range(num_levels)
])
if self.use_scale:
self.scales_proj_ctr = nn.ModuleList([
Scale(init_value=stride * self.proj_ctr_scale_init_factor) for stride in self.in_strides
])
# (pre-)compute (mean, std) of depth for each level, and determine the init value here.
self.scales_size = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)])
self.scales_conf = nn.ModuleList([Scale(init_value=1.0) for _ in range(self.num_levels)])
self.scales_depth = nn.ModuleList([
Scale(init_value=sigma * self.depth_scale_init_factor) for sigma in self.std_depth_per_level
])
self.offsets_depth = nn.ModuleList([Offset(init_value=b) for b in self.mean_depth_per_level])
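# Assuming Scale multiplies and Offset adds a learnable scalar, the per-level depth at init decodes roughly as mean_depth[l] + std_depth[l] * depth_scale_init_factor * raw_output.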
self._init_weights()
def _init_weights(self):
for l in self.box3d_tower.modules():
if isinstance(l, nn.Conv2d):
torch.nn.init.kaiming_normal_(l.weight, mode='fan_out', nonlinearity='relu')
if l.bias is not None:
torch.nn.init.constant_(l.bias, 0)
predictors = [self.box3d_quat, self.box3d_ctr, self.box3d_depth, self.box3d_size, self.box3d_conf]
for modules in predictors:
for l in modules.modules():
if isinstance(l, nn.Conv2d):
torch.nn.init.kaiming_uniform_(l.weight, a=1)
if l.bias is not None: # depth head may not have bias.
torch.nn.init.constant_(l.bias, 0)
def forward(self, x):
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf = [], [], [], [], []
dense_depth = None
for l, features in enumerate(x):
box3d_tower_out = self.box3d_tower(features)
_l = l if self.use_per_level_predictors else 0
# 3D box
quat = self.box3d_quat[_l](box3d_tower_out)
proj_ctr = self.box3d_ctr[_l](box3d_tower_out)
depth = self.box3d_depth[_l](box3d_tower_out)
size3d = self.box3d_size[_l](box3d_tower_out)
conf3d = self.box3d_conf[_l](box3d_tower_out)
if self.use_scale:
# TODO: to optimize the runtime, apply this scaling in inference (and loss compute) only on FG pixels?
proj_ctr = self.scales_proj_ctr[l](proj_ctr)
size3d = self.scales_size[l](size3d)
conf3d = self.scales_conf[l](conf3d)
depth = self.offsets_depth[l](self.scales_depth[l](depth))
box3d_quat.append(quat)
box3d_ctr.append(proj_ctr)
box3d_depth.append(depth)
box3d_size.append(size3d)
box3d_conf.append(conf3d)
return box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth
class FCOS3DLoss(nn.Module):
def __init__(self,
num_classes,
min_depth=0.1,
max_depth=80.0,
box3d_loss_weight=2.0,
conf3d_loss_weight=1.0,
conf_3d_temperature=1.0,
smooth_l1_loss_beta=0.05,
max_loss_per_group=20,
predict_allocentric_rot=True,
scale_depth_by_focal_lengths=True,
scale_depth_by_focal_lengths_factor=500.0,
class_agnostic=False,
predict_distance=False,
canon_box_sizes=None):
super().__init__()
self.canon_box_sizes = canon_box_sizes
self.min_depth = min_depth
self.max_depth = max_depth
self.predict_allocentric_rot = predict_allocentric_rot
self.scale_depth_by_focal_lengths = scale_depth_by_focal_lengths
self.scale_depth_by_focal_lengths_factor = scale_depth_by_focal_lengths_factor
self.predict_distance = predict_distance
self.box3d_reg_loss_fn = DisentangledBox3DLoss(smooth_l1_loss_beta, max_loss_per_group)
self.box3d_loss_weight = box3d_loss_weight
self.conf3d_loss_weight = conf3d_loss_weight
self.conf_3d_temperature = conf_3d_temperature
self.num_classes = num_classes
self.class_agnostic = class_agnostic
@force_fp32(apply_to=('box3d_quat', 'box3d_ctr', 'box3d_depth', 'box3d_size','box3d_conf', 'inv_intrinsics'))
def forward(
self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics, fcos2d_info,
targets
):
labels = targets['labels']
box3d_targets = targets['box3d_targets']
pos_inds = targets["pos_inds"]
if pos_inds.numel() == 0:
losses = {
"loss_box3d_quat": torch.stack([x.sum() * 0. for x in box3d_quat]).sum(),
"loss_box3d_proj_ctr": torch.stack([x.sum() * 0. for x in box3d_ctr]).sum(),
"loss_box3d_depth": torch.stack([x.sum() * 0. for x in box3d_depth]).sum(),
"loss_box3d_size": torch.stack([x.sum() * 0. for x in box3d_size]).sum(),
"loss_conf3d": torch.stack([x.sum() * 0. for x in box3d_conf]).sum()
}
return losses
if len(labels) != len(box3d_targets):
raise ValueError(
f"The size of 'labels' and 'box3d_targets' does not match: a={len(labels)}, b={len(box3d_targets)}"
)
num_classes = self.num_classes if not self.class_agnostic else 1
box3d_quat_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 4, num_classes) for x in box3d_quat])
box3d_ctr_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 2, num_classes) for x in box3d_ctr])
box3d_depth_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, num_classes) for x in box3d_depth])
box3d_size_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 3, num_classes) for x in box3d_size])
box3d_conf_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, num_classes) for x in box3d_conf])
# ----------------------
# 3D box disentangled loss
# ----------------------
box3d_targets = box3d_targets[pos_inds]
box3d_quat_pred = box3d_quat_pred[pos_inds]
box3d_ctr_pred = box3d_ctr_pred[pos_inds]
box3d_depth_pred = box3d_depth_pred[pos_inds]
box3d_size_pred = box3d_size_pred[pos_inds]
box3d_conf_pred = box3d_conf_pred[pos_inds]
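# Select, for each foreground location, the prediction slice of its ground-truth class; class-agnostic heads have only a single slice to squeeze.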
if self.class_agnostic:
box3d_quat_pred = box3d_quat_pred.squeeze(-1)
box3d_ctr_pred = box3d_ctr_pred.squeeze(-1)
box3d_depth_pred = box3d_depth_pred.squeeze(-1)
box3d_size_pred = box3d_size_pred.squeeze(-1)
box3d_conf_pred = box3d_conf_pred.squeeze(-1)
else:
I = labels[pos_inds][..., None, None]
box3d_quat_pred = torch.gather(box3d_quat_pred, dim=2, index=I.repeat(1, 4, 1)).squeeze(-1)
box3d_ctr_pred = torch.gather(box3d_ctr_pred, dim=2, index=I.repeat(1, 2, 1)).squeeze(-1)
box3d_depth_pred = torch.gather(box3d_depth_pred, dim=1, index=I.squeeze(-1)).squeeze(-1)
box3d_size_pred = torch.gather(box3d_size_pred, dim=2, index=I.repeat(1, 3, 1)).squeeze(-1)
box3d_conf_pred = torch.gather(box3d_conf_pred, dim=1, index=I.squeeze(-1)).squeeze(-1)
canon_box_sizes = box3d_quat_pred.new_tensor(self.canon_box_sizes)[labels[pos_inds]]
locations = targets["locations"][pos_inds]
im_inds = targets["im_inds"][pos_inds]
inv_intrinsics = inv_intrinsics[im_inds]
box3d_pred = predictions_to_boxes3d(
box3d_quat_pred,
box3d_ctr_pred,
box3d_depth_pred,
box3d_size_pred,
locations,
inv_intrinsics,
canon_box_sizes,
self.min_depth,
self.max_depth,
scale_depth_by_focal_lengths_factor=self.scale_depth_by_focal_lengths_factor,
scale_depth_by_focal_lengths=self.scale_depth_by_focal_lengths,
quat_is_allocentric=self.predict_allocentric_rot,
depth_is_distance=self.predict_distance
)
centerness_targets = fcos2d_info["centerness_targets"]
loss_denom = fcos2d_info["loss_denom"]
losses_box3d, box3d_l1_error = self.box3d_reg_loss_fn(box3d_pred, box3d_targets, locations, centerness_targets)
losses_box3d = {k: self.box3d_loss_weight * v / loss_denom for k, v in losses_box3d.items()}
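# The 3D confidence target is exp(-L1_error / temperature): a small disentangled-box L1 error yields a target close to 1.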
conf_3d_targets = torch.exp(-1. / self.conf_3d_temperature * box3d_l1_error)
loss_conf3d = F.binary_cross_entropy_with_logits(box3d_conf_pred, conf_3d_targets, reduction='none')
loss_conf3d = self.conf3d_loss_weight * (loss_conf3d * centerness_targets).sum() / loss_denom
losses = {"loss_conf3d": loss_conf3d, **losses_box3d}
return losses
class FCOS3DInference():
def __init__(self, cfg):
self.canon_box_sizes = cfg.DD3D.FCOS3D.CANONICAL_BOX3D_SIZES
self.min_depth = cfg.DD3D.FCOS3D.MIN_DEPTH
self.max_depth = cfg.DD3D.FCOS3D.MAX_DEPTH
self.predict_allocentric_rot = cfg.DD3D.FCOS3D.PREDICT_ALLOCENTRIC_ROT
self.scale_depth_by_focal_lengths = cfg.DD3D.FCOS3D.SCALE_DEPTH_BY_FOCAL_LENGTHS
self.scale_depth_by_focal_lengths_factor = cfg.DD3D.FCOS3D.SCALE_DEPTH_BY_FOCAL_LENGTHS_FACTOR
self.predict_distance = cfg.DD3D.FCOS3D.PREDICT_DISTANCE
self.num_classes = cfg.DD3D.NUM_CLASSES
self.class_agnostic = cfg.DD3D.FCOS3D.CLASS_AGNOSTIC_BOX3D
def __call__(
self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, fcos2d_info
):
# pred_instances: # List[List[Instances]], shape = (L, B)
for lvl, (box3d_quat_lvl, box3d_ctr_lvl, box3d_depth_lvl, box3d_size_lvl, box3d_conf_lvl) in \
enumerate(zip(box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf)):
# In-place modification: update per-level pred_instances.
self.forward_for_single_feature_map(
box3d_quat_lvl, box3d_ctr_lvl, box3d_depth_lvl, box3d_size_lvl, box3d_conf_lvl, inv_intrinsics,
pred_instances[lvl], fcos2d_info[lvl]
) # List of Instances; one for each image.
def forward_for_single_feature_map(
self, box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances, fcos2d_info
):
N = box3d_quat.shape[0]
num_classes = self.num_classes if not self.class_agnostic else 1
box3d_quat = box3d_quat.permute(0, 2, 3, 1).reshape(N, -1, 4, num_classes)
box3d_ctr = box3d_ctr.permute(0, 2, 3, 1).reshape(N, -1, 2, num_classes)
box3d_depth = box3d_depth.permute(0, 2, 3, 1).reshape(N, -1, num_classes)
box3d_size = box3d_size.permute(0, 2, 3, 1).reshape(N, -1, 3, num_classes)
box3d_conf = box3d_conf.permute(0, 2, 3, 1).reshape(N, -1, num_classes).sigmoid()
for i in range(N):
fg_inds_per_im = fcos2d_info['fg_inds_per_im'][i]
class_inds_per_im = fcos2d_info['class_inds_per_im'][i]
topk_indices = fcos2d_info['topk_indices'][i]
box3d_quat_per_im = box3d_quat[i][fg_inds_per_im]
box3d_ctr_per_im = box3d_ctr[i][fg_inds_per_im]
box3d_depth_per_im = box3d_depth[i][fg_inds_per_im]
box3d_size_per_im = box3d_size[i][fg_inds_per_im]
box3d_conf_per_im = box3d_conf[i][fg_inds_per_im]
if self.class_agnostic:
box3d_quat_per_im = box3d_quat_per_im.squeeze(-1)
box3d_ctr_per_im = box3d_ctr_per_im.squeeze(-1)
box3d_depth_per_im = box3d_depth_per_im.squeeze(-1)
box3d_size_per_im = box3d_size_per_im.squeeze(-1)
box3d_conf_per_im = box3d_conf_per_im.squeeze(-1)
else:
I = class_inds_per_im[..., None, None]
box3d_quat_per_im = torch.gather(box3d_quat_per_im, dim=2, index=I.repeat(1, 4, 1)).squeeze(-1)
box3d_ctr_per_im = torch.gather(box3d_ctr_per_im, dim=2, index=I.repeat(1, 2, 1)).squeeze(-1)
box3d_depth_per_im = torch.gather(box3d_depth_per_im, dim=1, index=I.squeeze(-1)).squeeze(-1)
box3d_size_per_im = torch.gather(box3d_size_per_im, dim=2, index=I.repeat(1, 3, 1)).squeeze(-1)
box3d_conf_per_im = torch.gather(box3d_conf_per_im, dim=1, index=I.squeeze(-1)).squeeze(-1)
if topk_indices is not None:
box3d_quat_per_im = box3d_quat_per_im[topk_indices]
box3d_ctr_per_im = box3d_ctr_per_im[topk_indices]
box3d_depth_per_im = box3d_depth_per_im[topk_indices]
box3d_size_per_im = box3d_size_per_im[topk_indices]
box3d_conf_per_im = box3d_conf_per_im[topk_indices]
# scores_per_im = pred_instances[i].scores.square()
# NOTE: Before refactoring, the squared score was used. Is raw 2D score better?
scores_per_im = pred_instances[i].scores
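# The final 3D score is the 2D detection score multiplied by the predicted 3D confidence.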
scores_3d_per_im = scores_per_im * box3d_conf_per_im
canon_box_sizes = box3d_quat.new_tensor(self.canon_box_sizes)[pred_instances[i].pred_classes]
inv_K = inv_intrinsics[i][None, ...].expand(len(box3d_quat_per_im), 3, 3)
locations = pred_instances[i].locations
pred_boxes3d = predictions_to_boxes3d(
box3d_quat_per_im,
box3d_ctr_per_im,
box3d_depth_per_im,
box3d_size_per_im,
locations,
inv_K,
canon_box_sizes,
self.min_depth,
self.max_depth,
scale_depth_by_focal_lengths_factor=self.scale_depth_by_focal_lengths_factor,
scale_depth_by_focal_lengths=self.scale_depth_by_focal_lengths,
quat_is_allocentric=self.predict_allocentric_rot,
depth_is_distance=self.predict_distance
)
# In-place modification: add fields to instances.
pred_instances[i].pred_boxes3d = pred_boxes3d
pred_instances[i].scores_3d = scores_3d_per_im
# Copyright 2021 Toyota Research Institute. All rights reserved.
import torch
import torch.nn.functional as F
from fvcore.nn.smooth_l1_loss import smooth_l1_loss
from torch import nn
from detectron2.layers import Conv2d, cat
#from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling.postprocessing import detector_postprocess as resize_instances
from detectron2.structures import Instances
from detectron2.utils import comm as d2_comm
from mmdet.models.builder import HEADS
from mmcv.runner import force_fp32
from projects.mmdet3d_plugin.dd3d.datasets.nuscenes import MAX_NUM_ATTRIBUTES
from .core import DD3D
#from tridet.modeling.dd3d.postprocessing import get_group_idxs, nuscenes_sample_aggregate
from .prepare_targets import DD3DTargetPreparer
from projects.mmdet3d_plugin.dd3d.structures.boxes3d import Boxes3D
from projects.mmdet3d_plugin.dd3d.structures.image_list import ImageList
from projects.mmdet3d_plugin.dd3d.utils.comm import reduce_sum
INF = 100000000.
class NuscenesDD3DTargetPreparer(DD3DTargetPreparer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
assert self.dd3d_enabled, f"{type(self).__name__} requires dd3d_enabled = True"
def __call__(self, locations, gt_instances, feature_shapes):
num_loc_list = [len(loc) for loc in locations]
# compute locations to size ranges
loc_to_size_range = []
for l, loc_per_level in enumerate(locations):
loc_to_size_range_per_level = loc_per_level.new_tensor(self.sizes_of_interest[l])
loc_to_size_range.append(loc_to_size_range_per_level[None].expand(num_loc_list[l], -1))
loc_to_size_range = torch.cat(loc_to_size_range, dim=0)
locations = torch.cat(locations, dim=0)
training_targets = self.compute_targets_for_locations(locations, gt_instances, loc_to_size_range, num_loc_list)
training_targets["locations"] = [locations.clone() for _ in range(len(gt_instances))]
training_targets["im_inds"] = [
locations.new_ones(locations.size(0), dtype=torch.long) * i for i in range(len(gt_instances))
]
box2d = training_targets.pop("box2d", None)
# transpose im first training_targets to level first ones
training_targets = {k: self._transpose(v, num_loc_list) for k, v in training_targets.items() if k != "box2d"}
training_targets["fpn_levels"] = [
loc.new_ones(len(loc), dtype=torch.long) * level for level, loc in enumerate(training_targets["locations"])
]
# Flatten targets: (L x B x H x W, TARGET_SIZE)
labels = cat([x.reshape(-1) for x in training_targets["labels"]])
box2d_reg_targets = cat([x.reshape(-1, 4) for x in training_targets["box2d_reg"]])
target_inds = cat([x.reshape(-1) for x in training_targets["target_inds"]])
locations = cat([x.reshape(-1, 2) for x in training_targets["locations"]])
im_inds = cat([x.reshape(-1) for x in training_targets["im_inds"]])
fpn_levels = cat([x.reshape(-1) for x in training_targets["fpn_levels"]])
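# Foreground locations are those whose assigned label is not the background index (num_classes).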
pos_inds = torch.nonzero(labels != self.num_classes).squeeze(1)
targets = {
"labels": labels,
"box2d_reg_targets": box2d_reg_targets,
"locations": locations,
"target_inds": target_inds,
"im_inds": im_inds,
"fpn_levels": fpn_levels,
"pos_inds": pos_inds
}
if self.dd3d_enabled:
box3d_targets = Boxes3D.cat(training_targets["box3d"])
targets.update({"box3d_targets": box3d_targets})
if box2d is not None:
# Original format is B x L x (H x W, 4)
# Need to be in L x (B, 4, H, W).
batched_box2d = []
for lvl, per_lvl_box2d in enumerate(zip(*box2d)):
# B x (H x W, 4)
h, w = feature_shapes[lvl]
batched_box2d_lvl = torch.stack([x.T.reshape(4, h, w) for x in per_lvl_box2d], dim=0)
batched_box2d.append(batched_box2d_lvl)
targets.update({"batched_box2d": batched_box2d})
# Nuscenes targets -- attribute / speed
attributes = cat([x.reshape(-1) for x in training_targets["attributes"]])
speeds = cat([x.reshape(-1) for x in training_targets["speeds"]])
targets.update({'attributes': attributes, 'speeds': speeds})
return targets
def compute_targets_for_locations(self, locations, targets, size_ranges, num_loc_list):
labels = []
box2d_reg = []
if self.dd3d_enabled:
box3d = []
target_inds = []
xs, ys = locations[:, 0], locations[:, 1]
# NuScenes targets -- attribute / speed
attributes, speeds = [], []
num_targets = 0
for im_i in range(len(targets)):
targets_per_im = targets[im_i]
bboxes = targets_per_im.gt_boxes.tensor
labels_per_im = targets_per_im.gt_classes
# no gt
if bboxes.numel() == 0:
labels.append(labels_per_im.new_zeros(locations.size(0)) + self.num_classes)
# reg_targets.append(locations.new_zeros((locations.size(0), 4)))
box2d_reg.append(locations.new_zeros((locations.size(0), 4)))
target_inds.append(labels_per_im.new_zeros(locations.size(0)) - 1)
if self.dd3d_enabled:
box3d.append(
Boxes3D(
locations.new_zeros(locations.size(0), 4),
locations.new_zeros(locations.size(0), 2),
locations.new_zeros(locations.size(0), 1),
locations.new_zeros(locations.size(0), 3),
locations.new_zeros(locations.size(0), 3, 3),
).to(torch.float32)
)
# NOTE: attributes and speeds.
attributes.append(labels_per_im.new_zeros(locations.size(0)))
speeds.append(labels_per_im.new_zeros(locations.size(0)))
continue
area = targets_per_im.gt_boxes.area()
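# (l, t, r, b): distances from every location to the left/top/right/bottom sides of each GT box, i.e. the FCOS 2D regression targets.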
l = xs[:, None] - bboxes[:, 0][None]
t = ys[:, None] - bboxes[:, 1][None]
r = bboxes[:, 2][None] - xs[:, None]
b = bboxes[:, 3][None] - ys[:, None]
# reg_targets_per_im = torch.stack([l, t, r, b], dim=2)
box2d_reg_per_im = torch.stack([l, t, r, b], dim=2)
if self.center_sample:
is_in_boxes = self.get_sample_region(bboxes, num_loc_list, xs, ys)
else:
is_in_boxes = box2d_reg_per_im.min(dim=2)[0] > 0
max_reg_targets_per_im = box2d_reg_per_im.max(dim=2)[0]
# limit the regression range for each location
is_cared_in_the_level = \
(max_reg_targets_per_im >= size_ranges[:, [0]]) & \
(max_reg_targets_per_im <= size_ranges[:, [1]])
locations_to_gt_area = area[None].repeat(len(locations), 1)
locations_to_gt_area[is_in_boxes == 0] = INF
locations_to_gt_area[is_cared_in_the_level == 0] = INF
# if there are still more than one objects for a location,
# we choose the one with minimal area
locations_to_min_area, locations_to_gt_inds = locations_to_gt_area.min(dim=1)
box2d_reg_per_im = box2d_reg_per_im[range(len(locations)), locations_to_gt_inds]
target_inds_per_im = locations_to_gt_inds + num_targets
num_targets += len(targets_per_im)
labels_per_im = labels_per_im[locations_to_gt_inds]
labels_per_im[locations_to_min_area == INF] = self.num_classes
labels.append(labels_per_im)
box2d_reg.append(box2d_reg_per_im)
target_inds.append(target_inds_per_im)
if self.dd3d_enabled:
# 3D box targets
box3d_per_im = targets_per_im.gt_boxes3d[locations_to_gt_inds]
box3d.append(box3d_per_im)
# NuScenes targets -- attribute / speed
attributes_per_im = targets_per_im.gt_attributes[locations_to_gt_inds]
speeds_per_im = targets_per_im.gt_speeds[locations_to_gt_inds]
attributes.append(attributes_per_im)
speeds.append(speeds_per_im)
ret = {"labels": labels, "box2d_reg": box2d_reg, "target_inds": target_inds}
if self.dd3d_enabled:
ret.update({"box3d": box3d})
# NuScenes targets -- attribute / speed
ret.update({"attributes": attributes, "speeds": speeds})
return ret
class NuscenesLoss(nn.Module):
def __init__(self, attr_loss_weight=0.2, speed_loss_weight=0.2):
super().__init__()
self.attr_loss_weight = attr_loss_weight
self.speed_loss_weight = speed_loss_weight
@force_fp32(apply_to=('attr_logits', 'speeds'))
def forward(self, attr_logits, speeds, fcos2d_info, targets):
# Flatten predictions
attr_logits = cat([x.permute(0, 2, 3, 1).reshape(-1, MAX_NUM_ATTRIBUTES) for x in attr_logits])
speeds = cat([x.permute(0, 2, 3, 1).reshape(-1) for x in speeds])
pos_inds = targets['pos_inds']
losses = {}
# 1. Attributes
attr_logits = attr_logits[pos_inds]
target_attr = targets['attributes'][pos_inds]
valid_attr_mask = target_attr != MAX_NUM_ATTRIBUTES # No attrs associated with class, or just attr missing.
if pos_inds.numel() == 0:
attr_weights = attr_logits.new_tensor(0.0) #torch.tensor(0.0).cuda()
else:
attr_weights = fcos2d_info['centerness_targets'][valid_attr_mask]
# Denominator for all foreground losses -- re-computed for features with valid attributes.
# attr_loss_denom = max(reduce_sum(attr_weights.sum()).item() / d2_comm.get_world_size(), 1e-6)
# NOTE: computing attr_weights_sum first and then passing it to reduce_sum() works; the commented-out one-liner above does not.
attr_weights_sum = attr_weights.sum()
attr_loss_denom = max(reduce_sum(attr_weights_sum).item() / d2_comm.get_world_size(), 1e-6)
if valid_attr_mask.sum() == 0:
losses.update({"loss_attr": attr_logits.sum() * 0.})
else:
attr_logits = attr_logits[valid_attr_mask]
target_attr = target_attr[valid_attr_mask]
xent = F.cross_entropy(attr_logits, target_attr, reduction='none')  # per-element loss, so it can be weighted by centerness below
loss_attr = (xent * attr_weights).sum() / attr_loss_denom
losses.update({"loss_attr": self.attr_loss_weight * loss_attr})
# 2. Speed
speeds = speeds[pos_inds]
target_speeds = targets['speeds'][pos_inds]
# NOTE: some GT speeds are NaN.
valid_gt_mask = torch.logical_not(torch.isnan(target_speeds))
if pos_inds.numel() == 0:
speed_weights = speeds.new_tensor(0.0) #torch.tensor(0.0).cuda()
else:
speed_weights = fcos2d_info['centerness_targets'][valid_gt_mask]
# Denominator for all foreground losses -- re-computed for features with valid speeds.
# speed_loss_denom = max(reduce_sum(speed_weights.sum()).item() / d2_comm.get_world_size(), 1e-6)
speed_weights_sum = speed_weights.sum()
speed_loss_denom = max(reduce_sum(speed_weights_sum).item() / d2_comm.get_world_size(), 1e-6)
# NOTE: the early return below is kept after reduce_sum() so that every rank still participates in the collective.
if pos_inds.numel() == 0:
losses = {"loss_attr": attr_logits.sum() * 0., "loss_speed": speeds.sum() * 0.}
# NOTE: This is probably unreachable, because training filters out images with empty annotations.
# NOTE: If it is reached, attr_weights can be unavailable for the reduce_sum() call.
return losses
if valid_gt_mask.sum() == 0:
losses.update({"loss_speed": speeds.sum() * 0.})
# return losses
else:
speeds = speeds[valid_gt_mask]
target_speeds = target_speeds[valid_gt_mask]
l1_error = smooth_l1_loss(speeds, target_speeds, beta=0.05)
loss_speed = (l1_error * speed_weights).sum() / speed_loss_denom
losses.update({"loss_speed": self.speed_loss_weight * loss_speed})
return losses
class NuscenesInference():
def __init__(self, cfg):
pass
def __call__(self, attr_logits, speeds, pred_instances, fcos2d_info):
"""Add 'pred_attribute', 'pred_speed' to Instances in 'pred_instances'."""
N = attr_logits[0].shape[0]
for lvl, (attr_logits_lvl, speed_lvl, info_lvl, instances_lvl) in \
enumerate(zip(attr_logits, speeds, fcos2d_info, pred_instances)):
attr_logits_lvl = attr_logits_lvl.permute(0, 2, 3, 1).reshape(N, -1, MAX_NUM_ATTRIBUTES)
speed_lvl = speed_lvl.permute(0, 2, 3, 1).reshape(N, -1)
for i in range(N):
fg_inds_per_im = info_lvl['fg_inds_per_im'][i]
topk_indices = info_lvl['topk_indices'][i]
attr_logits_per_im = attr_logits_lvl[i][fg_inds_per_im]
speed_per_im = speed_lvl[i][fg_inds_per_im]
if topk_indices is not None:
attr_logits_per_im = attr_logits_per_im[topk_indices]
speed_per_im = speed_per_im[topk_indices]
if len(attr_logits_per_im) == 0:
instances_lvl[i].pred_attributes = instances_lvl[i].pred_classes.new_tensor([])
instances_lvl[i].pred_speeds = instances_lvl[i].scores.new_tensor([])
else:
instances_lvl[i].pred_attributes = attr_logits_per_im.argmax(dim=1)
instances_lvl[i].pred_speeds = speed_per_im
@HEADS.register_module()
class NuscenesDD3D(DD3D):
def __init__(self,
num_classes,
in_channels,
strides,
fcos2d_cfg=dict(),
fcos2d_loss_cfg=dict(),
fcos3d_cfg=dict(),
fcos3d_loss_cfg=dict(),
target_assign_cfg=dict(),
nusc_loss_weight=dict(),
box3d_on=True,
feature_locations_offset="none"):
super().__init__(num_classes,
in_channels,
strides,
fcos2d_cfg=fcos2d_cfg,
fcos2d_loss_cfg=fcos2d_loss_cfg,
fcos3d_cfg=fcos3d_cfg,
fcos3d_loss_cfg=fcos3d_loss_cfg,
target_assign_cfg=target_assign_cfg,
box3d_on=box3d_on,
feature_locations_offset=feature_locations_offset)
# backbone_output_shape = self.backbone_output_shape
# in_channels = backbone_output_shape[0].channels
# --------------------------------------------------------------------------
# NuScenes predictions -- attribute / speed, computed from cls_tower output.
# --------------------------------------------------------------------------
self.attr_logits = Conv2d(in_channels, MAX_NUM_ATTRIBUTES, kernel_size=3, stride=1, padding=1, bias=True)
self.speed = Conv2d(in_channels, 1, kernel_size=3, stride=1, padding=1, bias=True, activation=F.relu)
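# The speed head applies a ReLU activation, so predicted speeds are non-negative.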
# init weights
for modules in [self.attr_logits, self.speed]:
for l in modules.modules():
if isinstance(l, nn.Conv2d):
torch.nn.init.kaiming_uniform_(l.weight, a=1)
if l.bias is not None: # depth head may not have bias.
torch.nn.init.constant_(l.bias, 0)
# Re-define target preparer
del self.prepare_targets
self.prepare_targets = NuscenesDD3DTargetPreparer(num_classes=num_classes,
input_shape=self.backbone_output_shape,
box3d_on=box3d_on,
**target_assign_cfg)
self.nuscenes_loss = NuscenesLoss(**nusc_loss_weight)
# NOTE: inference later
# self.nuscenes_inference = NuscenesInference(cfg)
# self.num_images_per_sample = cfg.MODEL.FCOS3D.NUSC_NUM_IMAGES_PER_SAMPLE
# NOTE: inference later
# self.num_images_per_sample = cfg.DD3D.NUSC.INFERENCE.NUM_IMAGES_PER_SAMPLE
# assert self.num_images_per_sample == 6
# assert cfg.DATALOADER.TEST.NUM_IMAGES_PER_GROUP == 6
# NOTE: NuScenes evaluator allows max. 500 detections per sample.
# self.max_num_dets_per_sample = cfg.DD3D.NUSC.INFERENCE.MAX_NUM_DETS_PER_SAMPLE
@force_fp32(apply_to=('features', ))
def forward(self, features, batched_inputs):
# NOTE:
# images = [x["image"].to(self.device) for x in batched_inputs]
# images = [self.preprocess_image(x) for x in images]
# NOTE: directly use inv_intrinsics
# if 'intrinsics' in batched_inputs[0]:
# intrinsics = [x['intrinsics'].to(self.device) for x in batched_inputs]
# else:
# intrinsics = None
# images = ImageList.from_tensors(images, self.backbone.size_divisibility, intrinsics=intrinsics)
if 'inv_intrinsics' in batched_inputs[0]:
inv_intrinsics = [x['inv_intrinsics'].to(features[0].device) for x in batched_inputs]
inv_intrinsics = torch.stack(inv_intrinsics, dim=0)
else:
inv_intrinsics = None
# NOTE:
# gt_dense_depth = None
# if 'depth' in batched_inputs[0]:
# gt_dense_depth = [x["depth"].to(self.device) for x in batched_inputs]
# gt_dense_depth = ImageList.from_tensors(
# gt_dense_depth, self.backbone.size_divisibility, intrinsics=intrinsics
# )
# NOTE: directly input feature
# features = self.backbone(images.tensor)
# features = [features[f] for f in self.in_features]
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(features[0].device) for x in batched_inputs]
else:
gt_instances = None
locations = self.compute_locations(features)
logits, box2d_reg, centerness, fcos2d_extra_output = self.fcos2d_head(features)
if not self.only_box2d:
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth = self.fcos3d_head(features)
# NOTE: directly use inv_intrinsics
# inv_intrinsics = images.intrinsics.inverse() if images.intrinsics is not None else None
# --------------------------------------------------------------------------
# NuScenes predictions -- attribute / speed, computed from cls_tower output.
# --------------------------------------------------------------------------
attr_logits, speeds = [], []
for x in fcos2d_extra_output['cls_tower_out']:
attr_logits.append(self.attr_logits(x))
speeds.append(self.speed(x))
if self.training:
assert gt_instances is not None
feature_shapes = [x.shape[-2:] for x in features]
training_targets = self.prepare_targets(locations, gt_instances, feature_shapes)
# NOTE:
# if gt_dense_depth is not None:
# training_targets.update({"dense_depth": gt_dense_depth})
losses = {}
fcos2d_loss, fcos2d_info = self.fcos2d_loss(logits, box2d_reg, centerness, training_targets)
losses.update(fcos2d_loss)
if not self.only_box2d:
fcos3d_loss = self.fcos3d_loss(
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, dense_depth, inv_intrinsics,
fcos2d_info, training_targets
)
losses.update(fcos3d_loss)
# Nuscenes loss -- attribute / speed
nuscenes_loss = self.nuscenes_loss(attr_logits, speeds, fcos2d_info, training_targets)
losses.update(nuscenes_loss)
return losses
else:
# TODO: do not support inference now
raise NotImplementedError
pred_instances, fcos2d_info = self.fcos2d_inference(
logits, box2d_reg, centerness, locations, images.image_sizes
)
if not self.only_box2d:
# This adds 'pred_boxes3d' and 'scores_3d' to Instances in 'pred_instances'.
self.fcos3d_inference(
box3d_quat, box3d_ctr, box3d_depth, box3d_size, box3d_conf, inv_intrinsics, pred_instances,
fcos2d_info
)
score_key = "scores_3d"
else:
score_key = "scores"
# This adds 'pred_attributes', 'pred_speed' to Instances in 'pred_instances'.
self.nuscenes_inference(attr_logits, speeds, pred_instances, fcos2d_info)
# Transpose to "image-first", i.e. (B, L)
pred_instances = list(zip(*pred_instances))
pred_instances = [Instances.cat(instances) for instances in pred_instances]
# 2D NMS and pick top-K.
if self.do_nms:
pred_instances = self.fcos2d_inference.nms_and_top_k(pred_instances, score_key)
if not self.only_box2d and self.do_bev_nms:
# Bird-eye-view NMS.
dummy_group_idxs = {i: [i] for i, _ in enumerate(pred_instances)}
if 'pose' in batched_inputs[0]:
poses = [x['pose'] for x in batched_inputs]
else:
poses = [x['extrinsics'] for x in batched_inputs]
pred_instances = nuscenes_sample_aggregate(
pred_instances,
dummy_group_idxs,
self.num_classes,
poses,
iou_threshold=self.bev_nms_iou_thresh,
include_boxes3d_global=False
)
if self.postprocess_in_inference:
processed_results = []
for results_per_image, input_per_image, image_size in \
zip(pred_instances, batched_inputs, images.image_sizes):
height = input_per_image.get("height", image_size[0])
width = input_per_image.get("width", image_size[1])
r = resize_instances(results_per_image, height, width)
processed_results.append({"instances": r})
# ----------------------------------------------------------
# NuScenes specific: cross-image (i.e. sample-level) BEV NMS.
# ----------------------------------------------------------
sample_tokens = [x['sample_token'] for x in batched_inputs]
group_idxs = get_group_idxs(sample_tokens, self.num_images_per_sample)
instances = [x['instances'] for x in processed_results]
global_poses = [x['pose'] for x in batched_inputs]
filtered_instances = nuscenes_sample_aggregate(
instances,
group_idxs,
self.num_classes,
global_poses,
self.bev_nms_iou_thresh,
max_num_dets_per_sample=self.max_num_dets_per_sample
)
processed_results = [{"instances": x} for x in filtered_instances]
else:
processed_results = [{"instances": x} for x in pred_instances]
return processed_results