Unverified Commit fe25f7a5 authored by Wenwei Zhang, committed by GitHub

Merge pull request #2867 from open-mmlab/dev-1.x

Bump version to 1.4.0
parents 5c0613be 0ef13b83
......@@ -275,12 +275,13 @@ class BaseInstance3DBoxes:
Tensor: A binary vector indicating whether each point is inside the
reference range.
"""
in_range_flags = ((self.tensor[:, 0] > box_range[0])
& (self.tensor[:, 1] > box_range[1])
& (self.tensor[:, 2] > box_range[2])
& (self.tensor[:, 0] < box_range[3])
& (self.tensor[:, 1] < box_range[4])
& (self.tensor[:, 2] < box_range[5]))
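# updated to use the boxes' gravity centers rather than the raw box origins
# stored in self.tensor (typically the bottom centers), so a box counts as
# in-range when its 3D center lies inside the given range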
gravity_center = self.gravity_center
in_range_flags = ((gravity_center[:, 0] > box_range[0])
& (gravity_center[:, 1] > box_range[1])
& (gravity_center[:, 2] > box_range[2])
& (gravity_center[:, 0] < box_range[3])
& (gravity_center[:, 1] < box_range[4])
& (gravity_center[:, 2] < box_range[5]))
return in_range_flags
@abstractmethod
......
# Copyright (c) Open-MMLab. All rights reserved.
__version__ = '1.3.0'
__version__ = '1.4.0'
short_version = __version__
......
......@@ -179,7 +179,10 @@ test_pipeline = [
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range)
]),
dict(type='Pack3DDetInputs', keys=['points'])
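# 'context_name' and 'timestamp' are packed as meta information because the
# Waymo evaluation matches each prediction to its frame in the ground-truth
# bin file by segment context name and frame timestamp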
dict(
type='Pack3DDetInputs',
keys=['points'],
meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
]
dataset_type = 'WaymoDataset'
......@@ -223,13 +226,7 @@ val_dataloader = dict(
test_dataloader = val_dataloader
val_evaluator = dict(
type='WaymoMetric',
ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
waymo_bin_file='./data/waymo/waymo_format/gt.bin',
data_root='./data/waymo/waymo_format',
backend_args=backend_args,
convert_kitti_format=False,
idx2metainfo='./data/waymo/waymo_format/idx2metainfo.pkl')
type='WaymoMetric', waymo_bin_file='./data/waymo/waymo_format/gt.bin')
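# with this change the evaluator only needs the Waymo-format ground-truth bin
# file; the KITTI-format annotation file, data_root and idx2metainfo mapping
# listed above are no longer required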
test_evaluator = val_evaluator
vis_backends = [dict(type='LocalVisBackend')]
......
......@@ -57,17 +57,25 @@ python tools/test.py projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-
### Training commands
The support of training DSVT is on the way.
In MMDetection3D's root directory, run the following command to train the model:
```bash
tools/dist_train.sh projects/DSVT/configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py 8 --sync_bn torch
```
## Results and models
### Waymo
| Middle Encoder | Backbone | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download |
| :------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | :----------: | :------: | :------------: | :----: | :-----: | :----: | :---------: | :------: |
| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × | | | 75.2 | 72.2 | 68.9 | 66.1 | |
| Middle Encoder | Backbone | Load Interval | Voxel type (voxel size) | Multi-Class NMS | Multi-frames | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download |
| :------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------: | :-----------: | :---------------------: | :-------------: | :----------: | :----: | :-----: | :----: | :---------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [DSVT](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | [ResSECOND](./configs/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class.py) | 5 | voxel (0.32) | ✓ | × | 75.5 | 72.4 | 69.2 | 66.3 | [log](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/dsvt/dsvt_voxel032_res-second_secfpn_8xb1-cyclic-12e_waymoD5-3d-3class_20230917_102130.log) |
**Note**:
- `ResSECOND` denotes that the base block in SECOND has residual layers.
- Regrettably, we are unable to provide the pre-trained model weights due to the [Waymo Dataset License Agreement](https://waymo.com/open/terms/), so we only provide the training logs as shown above.
## Citation
......
......@@ -88,25 +88,28 @@ model = dict(
loss_cls=dict(
type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0),
loss_bbox=dict(type='mmdet.L1Loss', reduction='mean', loss_weight=2.0),
loss_iou=dict(type='mmdet.L1Loss', reduction='sum', loss_weight=1.0),
loss_reg_iou=dict(
type='mmdet3d.DIoU3DLoss', reduction='mean', loss_weight=2.0),
norm_bbox=True),
# model training and testing settings
train_cfg=dict(
pts=dict(
grid_size=grid_size,
voxel_size=voxel_size,
out_size_factor=4,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])),
grid_size=grid_size,
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
out_size_factor=1,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]),
test_cfg=dict(
max_per_img=500,
max_pool_nms=False,
min_radius=[4, 12, 10, 1, 0.85, 0.175],
iou_rectifier=[[0.68, 0.71, 0.65]],
pc_range=[-80, -80],
out_size_factor=4,
out_size_factor=1,
voxel_size=voxel_size[:2],
nms_type='rotate',
multi_class_nms=True,
......@@ -128,6 +131,8 @@ db_sampler = dict(
coord_type='LIDAR',
load_dim=6,
use_dim=[0, 1, 2, 3, 4],
norm_intensity=True,
norm_elongation=True,
backend_args=backend_args),
backend_args=backend_args)
......@@ -138,25 +143,22 @@ train_pipeline = [
load_dim=6,
use_dim=5,
norm_intensity=True,
norm_elongation=True,
backend_args=backend_args),
# Add this if using `MultiFrameDeformableDecoderRPN`
# dict(
# type='LoadPointsFromMultiSweeps',
# sweeps_num=9,
# load_dim=6,
# use_dim=[0, 1, 2, 3, 4],
# pad_empty_sweeps=True,
# remove_close=True),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05],
translation_std=[0.5, 0.5, 0]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
translation_std=[0.5, 0.5, 0.5]),
dict(type='PointsRangeFilter3D', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter3D', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(
type='Pack3DDetInputs',
......@@ -172,25 +174,34 @@ test_pipeline = [
norm_intensity=True,
norm_elongation=True,
backend_args=backend_args),
dict(type='PointsRangeFilter3D', point_cloud_range=point_cloud_range),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range)
]),
dict(type='Pack3DDetInputs', keys=['points'])
type='Pack3DDetInputs',
keys=['points'],
meta_keys=['box_type_3d', 'sample_idx', 'context_name', 'timestamp'])
]
dataset_type = 'WaymoDataset'
train_dataloader = dict(
batch_size=1,
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='waymo_infos_train.pkl',
data_prefix=dict(pts='training/velodyne', sweeps='training/velodyne'),
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
metainfo=metainfo,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR',
# load one frame every five frames
load_interval=5,
backend_args=backend_args))
val_dataloader = dict(
batch_size=4,
num_workers=4,
......@@ -212,18 +223,59 @@ test_dataloader = val_dataloader
val_evaluator = dict(
type='WaymoMetric',
ann_file='./data/waymo/kitti_format/waymo_infos_val.pkl',
waymo_bin_file='./data/waymo/waymo_format/gt.bin',
data_root='./data/waymo/waymo_format',
backend_args=backend_args,
convert_kitti_format=False,
idx2metainfo='./data/waymo/waymo_format/idx2metainfo.pkl')
result_prefix='./dsvt_pred')
test_evaluator = val_evaluator
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
# schedules
lr = 1e-5
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=lr, weight_decay=0.05, betas=(0.9, 0.99)),
clip_grad=dict(max_norm=10, norm_type=2))
param_scheduler = [
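# cyclic schedule: the learning rate rises from 1e-5 to 1e-3 (lr * 100) over
# the first 1.2 epochs (warm-up) and then decays to 1e-9 (lr * 1e-4) over the
# remaining 10.8 epochs; momentum follows the opposite pattern, dropping to
# 0.85 during warm-up and recovering to 0.95 afterwards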
dict(
type='CosineAnnealingLR',
T_max=1.2,
eta_min=lr * 100,
begin=0,
end=1.2,
by_epoch=True,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
T_max=10.8,
eta_min=lr * 1e-4,
begin=1.2,
end=12,
by_epoch=True,
convert_to_iter_based=True),
# momentum scheduler
dict(
type='CosineAnnealingMomentum',
T_max=1.2,
eta_min=0.85,
begin=0,
end=1.2,
by_epoch=True,
convert_to_iter_based=True),
dict(
type='CosineAnnealingMomentum',
T_max=10.8,
eta_min=0.95,
begin=1.2,
end=12,
by_epoch=True,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=12, val_interval=1)
# runtime settings
val_cfg = dict()
test_cfg = dict()
......@@ -236,4 +288,12 @@ test_cfg = dict()
default_hooks = dict(
logger=dict(type='LoggerHook', interval=50),
checkpoint=dict(type='CheckpointHook', interval=5))
checkpoint=dict(type='CheckpointHook', interval=1))
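# turn off the listed augmentations after epoch 11, so the final epoch is
# trained without GT sampling, random flipping or rotation/scaling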
custom_hooks = [
dict(
type='DisableAugHook',
disable_after_epoch=11,
disable_aug_list=[
'GlobalRotScaleTrans', 'RandomFlip3D', 'ObjectSample'
])
]
from .disable_aug_hook import DisableAugHook
from .dsvt import DSVT
from .dsvt_head import DSVTCenterHead
from .dsvt_transformer import DSVTMiddleEncoder
from .dynamic_pillar_vfe import DynamicPillarVFE3D
from .map2bev import PointPillarsScatter3D
from .res_second import ResSECOND
from .transforms_3d import ObjectRangeFilter3D, PointsRangeFilter3D
from .utils import DSVTBBoxCoder
__all__ = [
'DSVTCenterHead', 'DSVT', 'DSVTMiddleEncoder', 'DynamicPillarVFE3D',
'PointPillarsScatter3D', 'ResSECOND', 'DSVTBBoxCoder'
'PointPillarsScatter3D', 'ResSECOND', 'DSVTBBoxCoder',
'ObjectRangeFilter3D', 'PointsRangeFilter3D', 'DisableAugHook'
]
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List
from mmengine.dataset import BaseDataset
from mmengine.hooks import Hook
from mmengine.model import is_model_wrapper
from mmengine.runner import Runner
from mmdet3d.registry import HOOKS
@HOOKS.register_module()
class DisableAugHook(Hook):
"""The hook of disabling augmentations during training.
Args:
disable_after_epoch (int): The number of epochs after which
the data augmentation will be disabled during training.
Defaults to 15.
disable_aug_list (list): The list of data augmentations to be
disabled during training. Defaults to [].
"""
def __init__(self,
disable_after_epoch: int = 15,
disable_aug_list: List = []):
self.disable_after_epoch = disable_after_epoch
self.disable_aug_list = disable_aug_list
self._restart_dataloader = False
def before_train_epoch(self, runner: Runner):
"""Close augmentation.
Args:
runner (Runner): The runner.
"""
epoch = runner.epoch
train_loader = runner.train_dataloader
model = runner.model
# TODO: refactor after mmengine using model wrapper
if is_model_wrapper(model):
model = model.module
if epoch == self.disable_after_epoch:
dataset = runner.train_dataloader.dataset
# handle dataset wrapper
if not isinstance(dataset, BaseDataset):
dataset = dataset.dataset
new_transforms = []
for transform in dataset.pipeline.transforms: # noqa: E501
if transform.__class__.__name__ not in self.disable_aug_list:
new_transforms.append(transform)
else:
runner.logger.info(
f'Disable {transform.__class__.__name__}')
dataset.pipeline.transforms = new_transforms
# The dataset pipeline cannot be updated when persistent_workers
# is True, so we need to force the dataloader's multi-process
# restart. This is a very hacky approach.
if hasattr(train_loader, 'persistent_workers'
) and train_loader.persistent_workers is True:
train_loader._DataLoader__initialized = False
train_loader._iterator = None
self._restart_dataloader = True
else:
# Once the restart is complete, we need to restore
# the initialization flag.
if self._restart_dataloader:
train_loader._DataLoader__initialized = True
......@@ -103,7 +103,11 @@ class DSVT(Base3DDetector):
Returns:
dict[str, Tensor]: A dictionary of loss components.
"""
pass
pts_feats = self.extract_feat(batch_inputs_dict)
losses = dict()
loss = self.bbox_head.loss(pts_feats, batch_data_samples)
losses.update(loss)
return losses
def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
batch_data_samples: List[Det3DDataSample],
......
import math
from typing import Dict, List, Tuple
import torch
import torch.nn as nn
from mmcv.ops import boxes_iou3d
from mmdet.models.utils import multi_apply
from mmengine.model import kaiming_init
from mmengine.structures import InstanceData
from torch import Tensor
from torch.nn.init import constant_
from mmdet3d.models import CenterHead
from mmdet3d.models.layers import circle_nms, nms_bev
from mmdet3d.models.utils import (clip_sigmoid, draw_heatmap_gaussian,
gaussian_radius)
from mmdet3d.registry import MODELS
from mmdet3d.structures import Det3DDataSample, xywhr2xyxyr
......@@ -18,8 +25,33 @@ class DSVTCenterHead(CenterHead):
This head adds an IoU prediction branch on top of the original CenterHead.
"""
def __init__(self, *args, **kwargs):
def __init__(self,
loss_iou=dict(
type='mmdet.L1Loss', reduction='mean', loss_weight=1),
loss_reg_iou=None,
*args,
**kwargs):
super(DSVTCenterHead, self).__init__(*args, **kwargs)
self.loss_iou = MODELS.build(loss_iou)
self.loss_iou_reg = MODELS.build(
loss_reg_iou) if loss_reg_iou is not None else None
def init_weights(self):
kaiming_init(
self.shared_conv.conv,
a=math.sqrt(5),
mode='fan_in',
nonlinearity='leaky_relu',
distribution='uniform')
for head in self.task_heads[0].heads:
if head == 'heatmap':
constant_(self.task_heads[0].__getattr__(head)[-1].bias,
self.task_heads[0].init_bias)
else:
for m in self.task_heads[0].__getattr__(head).modules():
if isinstance(m, nn.Conv2d):
kaiming_init(
m, mode='fan_in', nonlinearity='leaky_relu')
def forward_single(self, x: Tensor) -> dict:
"""Forward function for CenterPoint.
......@@ -66,7 +98,298 @@ class DSVTCenterHead(CenterHead):
Returns:
dict: Losses of each branch.
"""
pass
outs = self(pts_feats)
batch_gt_instance_3d = []
for data_sample in batch_data_samples:
batch_gt_instance_3d.append(data_sample.gt_instances_3d)
losses = self.loss_by_feat(outs, batch_gt_instance_3d)
return losses
def _decode_all_preds(self,
pred_dict,
point_cloud_range=None,
voxel_size=None):
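# decode the dense per-location predictions (reg / height / dim / rot and,
# if present, vel) back to boxes in LiDAR coordinates using the voxel size
# and point cloud range; returns a tensor of shape (B, H, W, 7 or 9)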
batch_size, _, H, W = pred_dict['reg'].shape
batch_center = pred_dict['reg'].permute(0, 2, 3, 1).contiguous().view(
batch_size, H * W, 2) # (B, H, W, 2)
batch_center_z = pred_dict['height'].permute(
0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1)
batch_dim = pred_dict['dim'].exp().permute(
0, 2, 3, 1).contiguous().view(batch_size, H * W, 3) # (B, H, W, 3)
batch_rot_cos = pred_dict['rot'][:, 0].unsqueeze(dim=1).permute(
0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1)
batch_rot_sin = pred_dict['rot'][:, 1].unsqueeze(dim=1).permute(
0, 2, 3, 1).contiguous().view(batch_size, H * W, 1) # (B, H, W, 1)
batch_vel = pred_dict['vel'].permute(0, 2, 3, 1).contiguous().view(
batch_size, H * W, 2) if 'vel' in pred_dict.keys() else None
angle = torch.atan2(batch_rot_sin, batch_rot_cos) # (B, H*W, 1)
ys, xs = torch.meshgrid([
torch.arange(
0, H, device=batch_center.device, dtype=batch_center.dtype),
torch.arange(
0, W, device=batch_center.device, dtype=batch_center.dtype)
])
ys = ys.view(1, H, W).repeat(batch_size, 1, 1)
xs = xs.view(1, H, W).repeat(batch_size, 1, 1)
xs = xs.view(batch_size, -1, 1) + batch_center[:, :, 0:1]
ys = ys.view(batch_size, -1, 1) + batch_center[:, :, 1:2]
xs = xs * voxel_size[0] + point_cloud_range[0]
ys = ys * voxel_size[1] + point_cloud_range[1]
box_part_list = [xs, ys, batch_center_z, batch_dim, angle]
if batch_vel is not None:
box_part_list.append(batch_vel)
box_preds = torch.cat((box_part_list),
dim=-1).view(batch_size, H, W, -1)
return box_preds
def _transpose_and_gather_feat(self, feat, ind):
feat = feat.permute(0, 2, 3, 1).contiguous()
feat = feat.view(feat.size(0), -1, feat.size(3))
feat = self._gather_feat(feat, ind)
return feat
def calc_iou_loss(self, iou_preds, batch_box_preds, mask, ind, gt_boxes):
"""
Args:
iou_preds: (batch x 1 x h x w)
batch_box_preds: (batch x (7 or 9) x h x w)
mask: (batch x max_objects)
ind: (batch x max_objects)
gt_boxes: List of batch groundtruth boxes.
Returns:
Tensor: IoU Loss.
"""
if mask.sum() == 0:
return iou_preds.new_zeros((1))
mask = mask.bool()
selected_iou_preds = self._transpose_and_gather_feat(iou_preds,
ind)[mask]
selected_box_preds = self._transpose_and_gather_feat(
batch_box_preds, ind)[mask]
gt_boxes = torch.cat(gt_boxes)
assert gt_boxes.size(0) == selected_box_preds.size(0)
iou_target = boxes_iou3d(selected_box_preds[:, 0:7], gt_boxes[:, 0:7])
iou_target = torch.diag(iou_target).view(-1)
iou_target = iou_target * 2 - 1 # [0, 1] ==> [-1, 1]
loss = self.loss_iou(selected_iou_preds.view(-1), iou_target)
loss = loss / torch.clamp(mask.sum(), min=1e-4)
return loss
def calc_iou_reg_loss(self, batch_box_preds, mask, ind, gt_boxes):
if mask.sum() == 0:
return batch_box_preds.new_zeros((1))
mask = mask.bool()
selected_box_preds = self._transpose_and_gather_feat(
batch_box_preds, ind)[mask]
gt_boxes = torch.cat(gt_boxes)
assert gt_boxes.size(0) == selected_box_preds.size(0)
loss = self.loss_iou_reg(selected_box_preds[:, 0:7], gt_boxes[:, 0:7])
return loss
def get_targets(
self,
batch_gt_instances_3d: List[InstanceData],
) -> Tuple[List[Tensor]]:
"""Generate targets.
How each output is transformed:
Each nested list is transposed so that all same-index elements in
each sub-list (1, ..., N) become the new sub-lists.
[ [a0, a1, a2, ... ], [b0, b1, b2, ... ], ... ]
==> [ [a0, b0, ... ], [a1, b1, ... ], [a2, b2, ... ] ]
The new transposed nested list is converted into a list of N
tensors generated by concatenating tensors in the new sub-lists.
[ tensor0, tensor1, tensor2, ... ]
Args:
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instances. It usually includes ``bboxes_3d`` and\
``labels_3d`` attributes.
Returns:
tuple[list[torch.Tensor]]: Tuple of target including
the following results in order.
- list[torch.Tensor]: Heatmap scores.
- list[torch.Tensor]: Ground truth boxes.
- list[torch.Tensor]: Indexes indicating the
position of the valid boxes.
- list[torch.Tensor]: Masks indicating which
boxes are valid.
"""
heatmaps, anno_boxes, inds, masks, task_gt_bboxes = multi_apply(
self.get_targets_single, batch_gt_instances_3d)
# Transpose heatmaps
heatmaps = list(map(list, zip(*heatmaps)))
heatmaps = [torch.stack(hms_) for hms_ in heatmaps]
# Transpose anno_boxes
anno_boxes = list(map(list, zip(*anno_boxes)))
anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in anno_boxes]
# Transpose inds
inds = list(map(list, zip(*inds)))
inds = [torch.stack(inds_) for inds_ in inds]
# Transpose masks
masks = list(map(list, zip(*masks)))
masks = [torch.stack(masks_) for masks_ in masks]
# Transpose task_gt_bboxes
task_gt_bboxes = list(map(list, zip(*task_gt_bboxes)))
return heatmaps, anno_boxes, inds, masks, task_gt_bboxes
def get_targets_single(self,
gt_instances_3d: InstanceData) -> Tuple[Tensor]:
"""Generate training targets for a single sample.
Args:
gt_instances_3d (:obj:`InstanceData`): Gt_instances_3d of
single data sample. It usually includes
``bboxes_3d`` and ``labels_3d`` attributes.
Returns:
tuple[list[torch.Tensor]]: Tuple of target including
the following results in order.
- list[torch.Tensor]: Heatmap scores.
- list[torch.Tensor]: Ground truth boxes.
- list[torch.Tensor]: Indexes indicating the position
of the valid boxes.
- list[torch.Tensor]: Masks indicating which boxes
are valid.
"""
gt_labels_3d = gt_instances_3d.labels_3d
gt_bboxes_3d = gt_instances_3d.bboxes_3d
device = gt_labels_3d.device
gt_bboxes_3d = torch.cat(
(gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]),
dim=1).to(device)
max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg']
grid_size = torch.tensor(self.train_cfg['grid_size']).to(device)
pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
voxel_size = torch.tensor(self.train_cfg['voxel_size'])
feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor']
# reorganize the gt_dict by tasks
task_masks = []
flag = 0
for class_name in self.class_names:
task_masks.append([
torch.where(gt_labels_3d == class_name.index(i) + flag)
for i in class_name
])
flag += len(class_name)
task_boxes = []
task_classes = []
flag2 = 0
for idx, mask in enumerate(task_masks):
task_box = []
task_class = []
for m in mask:
task_box.append(gt_bboxes_3d[m])
# 0 is background for each task, so we need to add 1 here.
task_class.append(gt_labels_3d[m] + 1 - flag2)
task_boxes.append(torch.cat(task_box, axis=0).to(device))
task_classes.append(torch.cat(task_class).long().to(device))
flag2 += len(mask)
draw_gaussian = draw_heatmap_gaussian
heatmaps, anno_boxes, inds, masks = [], [], [], []
for idx, task_head in enumerate(self.task_heads):
heatmap = gt_bboxes_3d.new_zeros(
(len(self.class_names[idx]), feature_map_size[1],
feature_map_size[0]))
anno_box = gt_bboxes_3d.new_zeros((max_objs, 8),
dtype=torch.float32)
ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64)
mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8)
num_objs = min(task_boxes[idx].shape[0], max_objs)
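# for each GT box: compute its size in feature-map cells, draw a Gaussian
# centered on the box center into the class heatmap, and record the
# flattened center index together with the regression targets
# (center offset, z, log-dims, cos/sin of yaw)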
for k in range(num_objs):
cls_id = task_classes[idx][k] - 1
length = task_boxes[idx][k][3]
width = task_boxes[idx][k][4]
length = length / voxel_size[0] / self.train_cfg[
'out_size_factor']
width = width / voxel_size[1] / self.train_cfg[
'out_size_factor']
if width > 0 and length > 0:
radius = gaussian_radius(
(width, length),
min_overlap=self.train_cfg['gaussian_overlap'])
radius = max(self.train_cfg['min_radius'], int(radius))
# be really careful for the coordinate system of
# your box annotation.
x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][
1], task_boxes[idx][k][2]
coor_x = (
x - pc_range[0]
) / voxel_size[0] / self.train_cfg['out_size_factor']
coor_y = (
y - pc_range[1]
) / voxel_size[1] / self.train_cfg['out_size_factor']
center = torch.tensor([coor_x, coor_y],
dtype=torch.float32,
device=device)
center_int = center.to(torch.int32)
# throw out not in range objects to avoid out of array
# area when creating the heatmap
if not (0 <= center_int[0] < feature_map_size[0]
and 0 <= center_int[1] < feature_map_size[1]):
continue
draw_gaussian(heatmap[cls_id], center_int, radius)
new_idx = k
x, y = center_int[0], center_int[1]
assert (y * feature_map_size[0] + x <
feature_map_size[0] * feature_map_size[1])
ind[new_idx] = y * feature_map_size[0] + x
mask[new_idx] = 1
# TODO: support other outdoor dataset
rot = task_boxes[idx][k][6]
box_dim = task_boxes[idx][k][3:6]
if self.norm_bbox:
box_dim = box_dim.log()
anno_box[new_idx] = torch.cat([
center - torch.tensor([x, y], device=device),
z.unsqueeze(0), box_dim,
torch.cos(rot).unsqueeze(0),
torch.sin(rot).unsqueeze(0)
])
heatmaps.append(heatmap)
anno_boxes.append(anno_box)
masks.append(mask)
inds.append(ind)
return heatmaps, anno_boxes, inds, masks, task_boxes
def loss_by_feat(self, preds_dicts: Tuple[List[dict]],
batch_gt_instances_3d: List[InstanceData], *args,
......@@ -79,13 +402,72 @@ class DSVTCenterHead(CenterHead):
tasks head, and the internal list indicate different
FPN level.
batch_gt_instances_3d (list[:obj:`InstanceData`]): Batch of
gt_instances. It usually includes ``bboxes_3d`` and\
gt_instances_3d. It usually includes ``bboxes_3d`` and
``labels_3d`` attributes.
Returns:
dict[str,torch.Tensor]: Loss of heatmap and bbox of each task.
"""
pass
heatmaps, anno_boxes, inds, masks, task_gt_bboxes = self.get_targets(
batch_gt_instances_3d)
loss_dict = dict()
for task_id, preds_dict in enumerate(preds_dicts):
# heatmap focal loss
preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap'])
num_pos = heatmaps[task_id].eq(1).float().sum().item()
loss_heatmap = self.loss_cls(
preds_dict[0]['heatmap'],
heatmaps[task_id],
avg_factor=max(num_pos, 1))
target_box = anno_boxes[task_id]
# reconstruct the anno_box from multiple reg heads
preds_dict[0]['anno_box'] = torch.cat(
(preds_dict[0]['reg'], preds_dict[0]['height'],
preds_dict[0]['dim'], preds_dict[0]['rot']),
dim=1)
# Regression loss for dimension, offset, height, rotation
ind = inds[task_id]
num = masks[task_id].float().sum()
pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous()
pred = pred.view(pred.size(0), -1, pred.size(3))
pred = self._gather_feat(pred, ind)
mask = masks[task_id].unsqueeze(2).expand_as(target_box).float()
isnotnan = (~torch.isnan(target_box)).float()
mask *= isnotnan
code_weights = self.train_cfg.get('code_weights', None)
bbox_weights = mask * mask.new_tensor(code_weights)
loss_bbox = self.loss_bbox(
pred, target_box, bbox_weights, avg_factor=(num + 1e-4))
loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap
loss_dict[f'task{task_id}.loss_bbox'] = loss_bbox
if 'iou' in preds_dict[0]:
batch_box_preds = self._decode_all_preds(
pred_dict=preds_dict[0],
point_cloud_range=self.train_cfg['point_cloud_range'],
voxel_size=self.train_cfg['voxel_size']
) # (B, H, W, 7 or 9)
batch_box_preds_for_iou = batch_box_preds.permute(
0, 3, 1, 2) # (B, 7 or 9, H, W)
loss_dict[f'task{task_id}.loss_iou'] = self.calc_iou_loss(
iou_preds=preds_dict[0]['iou'],
batch_box_preds=batch_box_preds_for_iou.clone().detach(),
mask=masks[task_id],
ind=ind,
gt_boxes=task_gt_bboxes[task_id])
if self.loss_iou_reg is not None:
loss_dict[f'task{task_id}.loss_reg_iou'] = \
self.calc_iou_reg_loss(
batch_box_preds=batch_box_preds_for_iou,
mask=masks[task_id],
ind=ind,
gt_boxes=task_gt_bboxes[task_id])
return loss_dict
def predict(self,
pts_feats: Tuple[torch.Tensor],
......@@ -158,6 +540,7 @@ class DSVTCenterHead(CenterHead):
else:
batch_dim = preds_dict[0]['dim']
# It's different from CenterHead
batch_rotc = preds_dict[0]['rot'][:, 0].unsqueeze(1)
batch_rots = preds_dict[0]['rot'][:, 1].unsqueeze(1)
batch_iou = (preds_dict[0]['iou'] +
......
# modified from https://github.com/Haiyang-W/DSVT
import numpy as np
import torch
import torch.nn as nn
import torch_scatter
......@@ -76,6 +77,7 @@ class DynamicPillarVFE3D(nn.Module):
self.voxel_x = voxel_size[0]
self.voxel_y = voxel_size[1]
self.voxel_z = voxel_size[2]
point_cloud_range = np.array(point_cloud_range).astype(np.float32)
self.x_offset = self.voxel_x / 2 + point_cloud_range[0]
self.y_offset = self.voxel_y / 2 + point_cloud_range[1]
self.z_offset = self.voxel_z / 2 + point_cloud_range[2]
......
# modified from https://github.com/Haiyang-W/DSVT
import warnings
from typing import Optional, Sequence, Tuple
from typing import Sequence, Tuple
from mmengine.model import BaseModule
from torch import Tensor
......@@ -78,8 +76,8 @@ class ResSECOND(BaseModule):
out_channels (list[int]): Output channels for multi-scale feature maps.
blocks_nums (list[int]): Number of blocks in each stage.
layer_strides (list[int]): Strides of each stage.
norm_cfg (dict): Config dict of normalization layers.
conv_cfg (dict): Config dict of convolutional layers.
init_cfg (dict, optional): Config for weight initialization.
Defaults to None.
"""
def __init__(self,
......@@ -87,8 +85,7 @@ class ResSECOND(BaseModule):
out_channels: Sequence[int] = [128, 128, 256],
blocks_nums: Sequence[int] = [1, 2, 2],
layer_strides: Sequence[int] = [2, 2, 2],
init_cfg: OptMultiConfig = None,
pretrained: Optional[str] = None) -> None:
init_cfg: OptMultiConfig = None) -> None:
super(ResSECOND, self).__init__(init_cfg=init_cfg)
assert len(layer_strides) == len(blocks_nums)
assert len(out_channels) == len(blocks_nums)
......@@ -108,14 +105,6 @@ class ResSECOND(BaseModule):
BasicResBlock(out_channels[i], out_channels[i]))
blocks.append(nn.Sequential(*cur_layers))
self.blocks = nn.Sequential(*blocks)
assert not (init_cfg and pretrained), \
'init_cfg and pretrained cannot be setting at the same time'
if isinstance(pretrained, str):
warnings.warn('DeprecationWarning: pretrained is a deprecated, '
'please use "init_cfg" instead')
self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
else:
self.init_cfg = dict(type='Kaiming', layer='Conv2d')
def forward(self, x: Tensor) -> Tuple[Tensor, ...]:
"""Forward function.
......
from typing import List
import numpy as np
from mmcv import BaseTransform
from mmdet3d.registry import TRANSFORMS
@TRANSFORMS.register_module()
class ObjectRangeFilter3D(BaseTransform):
"""Filter objects by the range. It differs from `ObjectRangeFilter` by
using `in_range_3d` instead of `in_range_bev`.
Required Keys:
- gt_bboxes_3d
Modified Keys:
- gt_bboxes_3d
Args:
point_cloud_range (list[float]): Point cloud range.
"""
def __init__(self, point_cloud_range: List[float]) -> None:
self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
def transform(self, input_dict: dict) -> dict:
"""Transform function to filter objects by the range.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d'
keys are updated in the result dict.
"""
gt_bboxes_3d = input_dict['gt_bboxes_3d']
gt_labels_3d = input_dict['gt_labels_3d']
mask = gt_bboxes_3d.in_range_3d(self.pcd_range)
gt_bboxes_3d = gt_bboxes_3d[mask]
# mask is a torch tensor but gt_labels_3d is still numpy array
# using mask to index gt_labels_3d will cause bug when
# len(gt_labels_3d) == 1, where mask=1 will be interpreted
# as gt_labels_3d[1] and cause out of index error
gt_labels_3d = gt_labels_3d[mask.numpy().astype(bool)]
# limit rad to [-pi, pi]
gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi)
input_dict['gt_bboxes_3d'] = gt_bboxes_3d
input_dict['gt_labels_3d'] = gt_labels_3d
return input_dict
def __repr__(self) -> str:
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
return repr_str
@TRANSFORMS.register_module()
class PointsRangeFilter3D(BaseTransform):
"""Filter points by the range. It differs from `PointRangeFilter` by using
`in_range_bev` instead of `in_range_3d`.
Required Keys:
- points
- pts_instance_mask (optional)
Modified Keys:
- points
- pts_instance_mask (optional)
Args:
point_cloud_range (list[float]): Point cloud range.
"""
def __init__(self, point_cloud_range: List[float]) -> None:
self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
def transform(self, input_dict: dict) -> dict:
"""Transform function to filter points by the range.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after filtering, 'points', 'pts_instance_mask'
and 'pts_semantic_mask' keys are updated in the result dict.
"""
points = input_dict['points']
points_mask = points.in_range_bev(self.pcd_range[[0, 1, 3, 4]])
clean_points = points[points_mask]
input_dict['points'] = clean_points
points_mask = points_mask.numpy()
pts_instance_mask = input_dict.get('pts_instance_mask', None)
pts_semantic_mask = input_dict.get('pts_semantic_mask', None)
if pts_instance_mask is not None:
input_dict['pts_instance_mask'] = pts_instance_mask[points_mask]
if pts_semantic_mask is not None:
input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask]
return input_dict
def __repr__(self) -> str:
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(point_cloud_range={self.pcd_range.tolist()})'
return repr_str
......@@ -3,10 +3,11 @@ from typing import Dict, List, Optional
import numpy as np
import torch
import torch.nn as nn
from mmdet.models.losses.utils import weighted_loss
from torch import Tensor
from mmdet3d.models.task_modules import CenterPointBBoxCoder
from mmdet3d.registry import TASK_UTILS
from mmdet3d.registry import MODELS, TASK_UTILS
from .ops.ingroup_inds.ingroup_inds_op import ingroup_inds
get_inner_win_inds_cuda = ingroup_inds
......@@ -266,7 +267,7 @@ class DSVTBBoxCoder(CenterPointBBoxCoder):
thresh_mask = final_scores > self.score_threshold
if self.post_center_range is not None:
self.post_center_range = torch.tensor(
self.post_center_range = torch.as_tensor(
self.post_center_range, device=heat.device)
mask = (final_box_preds[..., :3] >=
self.post_center_range[:3]).all(2)
......@@ -298,3 +299,142 @@ class DSVTBBoxCoder(CenterPointBBoxCoder):
'support post_center_range is not None for now!')
return predictions_dicts
def center_to_corner2d(center, dim):
corners_norm = torch.tensor(
[[-0.5, -0.5], [-0.5, 0.5], [0.5, 0.5], [0.5, -0.5]],
device=dim.device).type_as(center) # (4, 2)
corners = dim.view([-1, 1, 2]) * corners_norm.view([1, 4, 2]) # (N, 4, 2)
corners = corners + center.view(-1, 1, 2)
return corners
@weighted_loss
def diou3d_loss(pred_boxes, gt_boxes, eps: float = 1e-7):
"""
modified from https://github.com/agent-sgs/PillarNet/blob/master/det3d/core/utils/center_utils.py # noqa
Args:
pred_boxes (Tensor): Predicted boxes with shape (N, 7).
gt_boxes (Tensor): Ground-truth boxes with shape (N, 7).
Returns:
Tensor: Distance-IoU Loss.
"""
assert pred_boxes.shape[0] == gt_boxes.shape[0]
qcorners = center_to_corner2d(pred_boxes[:, :2],
pred_boxes[:, 3:5]) # (N, 4, 2)
gcorners = center_to_corner2d(gt_boxes[:, :2], gt_boxes[:,
3:5]) # (N, 4, 2)
inter_max_xy = torch.minimum(qcorners[:, 2], gcorners[:, 2])
inter_min_xy = torch.maximum(qcorners[:, 0], gcorners[:, 0])
out_max_xy = torch.maximum(qcorners[:, 2], gcorners[:, 2])
out_min_xy = torch.minimum(qcorners[:, 0], gcorners[:, 0])
# calculate area
volume_pred_boxes = pred_boxes[:, 3] * pred_boxes[:, 4] * pred_boxes[:, 5]
volume_gt_boxes = gt_boxes[:, 3] * gt_boxes[:, 4] * gt_boxes[:, 5]
inter_h = torch.minimum(
pred_boxes[:, 2] + 0.5 * pred_boxes[:, 5],
gt_boxes[:, 2] + 0.5 * gt_boxes[:, 5]) - torch.maximum(
pred_boxes[:, 2] - 0.5 * pred_boxes[:, 5],
gt_boxes[:, 2] - 0.5 * gt_boxes[:, 5])
inter_h = torch.clamp(inter_h, min=0)
inter = torch.clamp((inter_max_xy - inter_min_xy), min=0)
volume_inter = inter[:, 0] * inter[:, 1] * inter_h
volume_union = volume_gt_boxes + volume_pred_boxes - volume_inter + eps
# boxes_iou3d_gpu(pred_boxes, gt_boxes)
inter_diag = torch.pow(gt_boxes[:, 0:3] - pred_boxes[:, 0:3], 2).sum(-1)
outer_h = torch.maximum(
gt_boxes[:, 2] + 0.5 * gt_boxes[:, 5],
pred_boxes[:, 2] + 0.5 * pred_boxes[:, 5]) - torch.minimum(
gt_boxes[:, 2] - 0.5 * gt_boxes[:, 5],
pred_boxes[:, 2] - 0.5 * pred_boxes[:, 5])
outer_h = torch.clamp(outer_h, min=0)
outer = torch.clamp((out_max_xy - out_min_xy), min=0)
outer_diag = outer[:, 0]**2 + outer[:, 1]**2 + outer_h**2 + eps
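# DIoU = IoU - (squared distance between box centers) / (squared diagonal
# of the minimum enclosing box)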
dious = volume_inter / volume_union - inter_diag / outer_diag
dious = torch.clamp(dious, min=-1.0, max=1.0)
loss = 1 - dious
return loss
@MODELS.register_module()
class DIoU3DLoss(nn.Module):
r"""3D bboxes Implementation of `Distance-IoU Loss: Faster and Better
Learning for Bounding Box Regression <https://arxiv.org/abs/1911.08287>`_.
Code is modified from https://github.com/Zzh-tju/DIoU.
Args:
eps (float): Epsilon to avoid log(0). Defaults to 1e-6.
reduction (str): Options are "none", "mean" and "sum".
Defaults to "mean".
loss_weight (float): Weight of loss. Defaults to 1.0.
"""
def __init__(self,
eps: float = 1e-6,
reduction: str = 'mean',
loss_weight: float = 1.0) -> None:
super().__init__()
self.eps = eps
self.reduction = reduction
self.loss_weight = loss_weight
def forward(self,
pred: Tensor,
target: Tensor,
weight: Optional[Tensor] = None,
avg_factor: Optional[int] = None,
reduction_override: Optional[str] = None,
**kwargs) -> Tensor:
"""Forward function.
Args:
pred (Tensor): Predicted 3D bboxes with shape (n, 7).
target (Tensor): The learning target of the prediction,
shape (n, 7).
weight (Optional[Tensor], optional): The weight of loss for each
prediction. Defaults to None.
avg_factor (Optional[int], optional): Average factor that is used
to average the loss. Defaults to None.
reduction_override (Optional[str], optional): The reduction method
used to override the original reduction method of the loss.
Defaults to None. Options are "none", "mean" and "sum".
Returns:
Tensor: Loss tensor.
"""
if weight is not None and not torch.any(weight > 0):
if pred.dim() == weight.dim() + 1:
weight = weight.unsqueeze(1)
return (pred * weight).sum() # 0
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
if weight is not None and weight.dim() > 1:
# TODO: remove this in the future
# reduce the weight of shape (n, 7) to (n,) to match the
# diou3d_loss of shape (n,)
assert weight.shape == pred.shape
weight = weight.mean(-1)
loss = self.loss_weight * diou3d_loss(
pred,
target,
weight,
eps=self.eps,
reduction=reduction,
avg_factor=avg_factor,
**kwargs)
return loss
# NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection
> [NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection](https://arxiv.org/abs/2307.14620)
<!-- [ALGORITHM] -->
## Abstract
NeRF-Det is a novel method for indoor 3D detection with posed RGB images as input. Unlike existing indoor 3D detection methods that struggle to model scene geometry, NeRF-Det makes novel use of NeRF in an end-to-end manner to explicitly estimate 3D geometry, thereby improving 3D detection performance. Specifically, to avoid the significant extra latency associated with per-scene optimization of NeRF, NeRF-Det introduces sufficient geometry priors to enhance the generalizability of NeRF-MLP. Furthermore, it subtly connects the detection and NeRF branches through a shared MLP, enabling an efficient adaptation of NeRF to detection and yielding geometry-aware volumetric representations for 3D detection. NeRF-Det outperforms state-of-the-art methods by 3.9 mAP and 3.1 mAP on the ScanNet and ARKITScenes benchmarks, respectively. The authors provide extensive analysis to shed light on how NeRF-Det works. As a result of the joint-training design, NeRF-Det is able to generalize well to unseen scenes for object detection, view synthesis, and depth estimation tasks without requiring per-scene optimization. Code will be available at https://github.com/facebookresearch/NeRF-Det
<div align=center>
<img src="https://chenfengxu714.github.io/nerfdet/static/images/method-cropped_1.png" width="800"/>
</div>
## Introduction
This directory contains the implementation of NeRF-Det (https://arxiv.org/abs/2307.14620). Our implementation is built on top of MMDetection3D. We have updated NeRF-Det to be compatible with the latest mmdet3d version; the codebase and config files have all been adapted accordingly. All previously released pretrained models have been verified against the results listed below. However, newly trained models are yet to be uploaded.
<!-- Share any information you would like others to know. For example:
Author: @xxx.
This is an implementation of \[XXX\]. -->
## Dataset
The ScanNet dataset format in the latest version of mmdet3d only supports LiDAR-based tasks. For NeRF-Det, we need to create a new format of the ScanNet dataset.
Please follow the ScanNet data preparation instructions in mmdet3d to prepare the raw data. After that, use the following command to generate the pkls used in NeRF-Det.
```bash
python projects/NeRF-Det/prepare_infos.py --root-path ./data/scannet --out-dir ./data/scannet
```
The new format of the pkl is organized as below:
- scannet_infos_train.pkl: The train data infos. The detailed info of each scan is as follows:
- info\['instances'\]: A list of dicts containing all annotations; each dict holds the annotation information of a single instance. For the i-th instance:
- info\['instances'\]\[i\]\['bbox_3d'\]: List of 6 numbers representing the axis-aligned box in the depth coordinate system, in (x, y, z, l, w, h) order.
- info\['instances'\]\[i\]\['bbox_label_3d'\]: The label of each 3D bounding box.
- info\['cam2img'\]: The intrinsic matrix. Every scene has one matrix.
- info\['lidar2cam'\]: The extrinsic matrices. Every scene has 300 matrices.
- info\['img_paths'\]: The paths of the 300 RGB images.
- info\['axis_align_matrix'\]: The axis-alignment matrix. Every scene has one matrix.
After preparing your ScanNet dataset pkls, please change the paths in the configs to fit your project.
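If you want to sanity-check the generated files, here is a minimal sketch (for illustration only; it assumes the standard mmdet3d v1.x info layout with `metainfo`/`data_list` keys and the `*_new.pkl` file name used by the provided configs) that loads one scan and prints a few of the fields described above:
```python
import pickle

# Hypothetical quick check of the generated infos; adjust the path and keys
# if your pkl files are organized differently.
with open('data/scannet/scannet_infos_train_new.pkl', 'rb') as f:
    infos = pickle.load(f)

# mmdet3d v1.x info files are usually a dict with 'metainfo' and 'data_list'.
data_list = infos['data_list'] if isinstance(infos, dict) else infos
scan = data_list[0]

print(sorted(scan.keys()))              # e.g. axis_align_matrix, cam2img, img_paths, instances, ...
print(len(scan['img_paths']))           # 300 RGB images per scene
print(scan['instances'][0]['bbox_3d'])  # (x, y, z, l, w, h) in the depth coordinate system
```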
## Train
In MMDet3D's root directory, run the following command to train the model:
```bash
python tools/train.py projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py --work-dir ${WORK_DIR}
```
## Results and Models
### NeRF-Det
| Backbone | mAP@25 | mAP@50 | Log |
| :-------------------------------------------------------------: | :----: | :----: | :-------: |
| [NeRF-Det-R50](./configs/nerfdet_res50_2x_low_res.py) | 53.0 | 26.8 | [log](<>) |
| [NeRF-Det-R50\*](./configs/nerfdet_res50_2x_low_res_depth.py) | 52.2 | 28.5 | [log](<>) |
| [NeRF-Det-R101\*](./configs/nerfdet_res101_2x_low_res_depth.py) | 52.3 | 28.5 | [log](<>) |
(Here NeRF-Det-R50\* means this model uses depth information in the training step)
### Notes
- The values shown in the table all represent the best mAP during training.
- Since there is a lot of randomness in the behavior of the model, we conducted three experiments for each config and took the average. The mAP values shown in the above table are these averages.
- We also conducted the same experiments with the original code; the results are shown below.
| Backbone | mAP@25 | mAP@50 |
| :-------------: | :----: | :----: |
| NeRF-Det-R50 | 52.8 | 26.8 |
| NeRF-Det-R50\* | 52.4 | 27.5 |
| NeRF-Det-R101\* | 52.8 | 28.6 |
- Attention: Because of the randomness in the construction of the ScanNet dataset itself and in the behavior of the model, the training results can fluctuate considerably. In our experience, results fluctuate by plus or minus 1.5 points.
## Evaluation using pretrained models
1. Download the pretrained checkpoints through the links in the above table.
2. Testing
To test, use:
```bash
python tools/test.py projects/NeRF-Det/configs/nerfdet_res50_2x_low_res.py ${CHECKPOINT_PATH}
```
## Citation
<!-- You may remove this section if not applicable. -->
```latex
@inproceedings{
xu2023nerfdet,
title={NeRF-Det: Learning Geometry-Aware Volumetric Representation for Multi-View 3D Object Detection},
author={Xu, Chenfeng and Wu, Bichen and Hou, Ji and Tsai, Sam and Li, Ruilong and Wang, Jialiang and Zhan, Wei and He, Zijian and Vajda, Peter and Keutzer, Kurt and Tomizuka, Masayoshi},
booktitle={ICCV},
year={2023},
}
@inproceedings{
park2023time,
title={Time Will Tell: New Outlooks and A Baseline for Temporal Multi-View 3D Object Detection},
author={Jinhyung Park and Chenfeng Xu and Shijia Yang and Kurt Keutzer and Kris M. Kitani and Masayoshi Tomizuka and Wei Zhan},
booktitle={The Eleventh International Conference on Learning Representations },
year={2023},
url={https://openreview.net/forum?id=H3HcEJA2Um}
}
```
_base_ = ['../../../configs/_base_/default_runtime.py']
custom_imports = dict(imports=['projects.NeRF-Det.nerfdet'])
prior_generator = dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-3.2, -3.2, -1.28, 3.2, 3.2, 1.28]],
rotations=[.0])
model = dict(
type='NerfDet',
data_preprocessor=dict(
type='NeRFDetDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=10),
backbone=dict(
type='mmdet.ResNet',
depth=101,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
style='pytorch'),
neck=dict(
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=4),
neck_3d=dict(
type='IndoorImVoxelNeck',
in_channels=256,
out_channels=128,
n_blocks=[1, 1, 1]),
bbox_head=dict(
type='NerfDetHead',
bbox_loss=dict(type='AxisAlignedIoULoss', loss_weight=1.0),
n_classes=18,
n_levels=3,
n_channels=128,
n_reg_outs=6,
pts_assign_threshold=27,
pts_center_threshold=18,
prior_generator=prior_generator),
prior_generator=prior_generator,
voxel_size=[.16, .16, .2],
n_voxels=[40, 40, 16],
aabb=([-2.7, -2.7, -0.78], [3.7, 3.7, 1.78]),
near_far_range=[0.2, 8.0],
N_samples=64,
N_rand=2048,
nerf_mode='image',
depth_supervise=True,
use_nerf_mask=True,
nerf_sample_view=20,
squeeze_scale=4,
nerf_density=True,
train_cfg=dict(),
test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01))
dataset_type = 'MultiViewScanNetDataset'
data_root = 'data/scannet/'
class_names = [
'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf',
'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain',
'toilet', 'sink', 'bathtub', 'garbagebin'
]
metainfo = dict(CLASSES=class_names)
file_client_args = dict(backend='disk')
input_modality = dict(
use_camera=True,
use_depth=True,
use_lidar=False,
use_neuralrecon_depth=False,
use_ray=True)
backend_args = None
train_collect_keys = [
'img', 'gt_bboxes_3d', 'gt_labels_3d', 'depth', 'lightpos', 'nerf_sizes',
'raydirs', 'gt_images', 'gt_depths', 'denorm_images'
]
test_collect_keys = [
'img',
'depth',
'lightpos',
'nerf_sizes',
'raydirs',
'gt_images',
'gt_depths',
'denorm_images',
]
train_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=48,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=10),
dict(type='RandomShiftOrigin', std=(.7, .7, .0)),
dict(type='PackNeRFDetInputs', keys=train_collect_keys)
]
test_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=101,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=1),
dict(type='PackNeRFDetInputs', keys=test_collect_keys)
]
train_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=6,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_train_new.pkl',
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo)))
val_dataloader = dict(
batch_size=1,
num_workers=5,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_val_new.pkl',
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo))
test_dataloader = val_dataloader
val_evaluator = dict(type='IndoorMetric')
test_evaluator = val_evaluator
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
test_cfg = dict()
val_cfg = dict()
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
# hooks
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=12))
# runtime
find_unused_parameters = True # only 1 of 4 FPN outputs is used
_base_ = ['./nerfdet_res50_2x_low_res_depth.py']
model = dict(depth_supervise=False)
dataset_type = 'MultiViewScanNetDataset'
data_root = 'data/scannet/'
class_names = [
'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf',
'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain',
'toilet', 'sink', 'bathtub', 'garbagebin'
]
metainfo = dict(CLASSES=class_names)
file_client_args = dict(backend='disk')
input_modality = dict(use_depth=False)
backend_args = None
train_collect_keys = [
'img', 'gt_bboxes_3d', 'gt_labels_3d', 'lightpos', 'nerf_sizes', 'raydirs',
'gt_images', 'gt_depths', 'denorm_images'
]
test_collect_keys = [
'img',
'lightpos',
'nerf_sizes',
'raydirs',
'gt_images',
'gt_depths',
'denorm_images',
]
train_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=50,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=10),
dict(type='RandomShiftOrigin', std=(.7, .7, .0)),
dict(type='PackNeRFDetInputs', keys=train_collect_keys)
]
test_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=101,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=1),
dict(type='PackNeRFDetInputs', keys=test_collect_keys)
]
train_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=6,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_train_new.pkl',
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo)))
val_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_val_new.pkl',
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo))
test_dataloader = val_dataloader
_base_ = ['../../../configs/_base_/default_runtime.py']
custom_imports = dict(imports=['projects.NeRF-Det.nerfdet'])
prior_generator = dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-3.2, -3.2, -1.28, 3.2, 3.2, 1.28]],
rotations=[.0])
model = dict(
type='NerfDet',
data_preprocessor=dict(
type='NeRFDetDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=True,
pad_size_divisor=10),
backbone=dict(
type='mmdet.ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
style='pytorch'),
neck=dict(
type='mmdet.FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=4),
neck_3d=dict(
type='IndoorImVoxelNeck',
in_channels=256,
out_channels=128,
n_blocks=[1, 1, 1]),
bbox_head=dict(
type='NerfDetHead',
bbox_loss=dict(type='AxisAlignedIoULoss', loss_weight=1.0),
n_classes=18,
n_levels=3,
n_channels=128,
n_reg_outs=6,
pts_assign_threshold=27,
pts_center_threshold=18,
prior_generator=prior_generator),
prior_generator=prior_generator,
voxel_size=[.16, .16, .2],
n_voxels=[40, 40, 16],
aabb=([-2.7, -2.7, -0.78], [3.7, 3.7, 1.78]),
near_far_range=[0.2, 8.0],
N_samples=64,
N_rand=2048,
nerf_mode='image',
depth_supervise=True,
use_nerf_mask=True,
nerf_sample_view=20,
squeeze_scale=4,
nerf_density=True,
train_cfg=dict(),
test_cfg=dict(nms_pre=1000, iou_thr=.25, score_thr=.01))
dataset_type = 'MultiViewScanNetDataset'
data_root = 'data/scannet/'
class_names = [
'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf',
'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain',
'toilet', 'sink', 'bathtub', 'garbagebin'
]
metainfo = dict(CLASSES=class_names)
file_client_args = dict(backend='disk')
input_modality = dict(
use_camera=True,
use_depth=True,
use_lidar=False,
use_neuralrecon_depth=False,
use_ray=True)
backend_args = None
train_collect_keys = [
'img', 'gt_bboxes_3d', 'gt_labels_3d', 'depth', 'lightpos', 'nerf_sizes',
'raydirs', 'gt_images', 'gt_depths', 'denorm_images'
]
test_collect_keys = [
'img',
'depth',
'lightpos',
'nerf_sizes',
'raydirs',
'gt_images',
'gt_depths',
'denorm_images',
]
train_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=50,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=10),
dict(type='RandomShiftOrigin', std=(.7, .7, .0)),
dict(type='PackNeRFDetInputs', keys=train_collect_keys)
]
test_pipeline = [
dict(type='LoadAnnotations3D'),
dict(
type='MultiViewPipeline',
n_images=101,
transforms=[
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(type='Resize', scale=(320, 240), keep_ratio=True),
],
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
margin=10,
depth_range=[0.5, 5.5],
loading='random',
nerf_target_views=1),
dict(type='PackNeRFDetInputs', keys=test_collect_keys)
]
train_dataloader = dict(
batch_size=1,
num_workers=1,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='RepeatDataset',
times=6,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_train_new.pkl',
pipeline=train_pipeline,
modality=input_modality,
test_mode=False,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo)))
val_dataloader = dict(
batch_size=1,
num_workers=5,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file='scannet_infos_val_new.pkl',
pipeline=test_pipeline,
modality=input_modality,
test_mode=True,
filter_empty_gt=True,
box_type_3d='Depth',
metainfo=metainfo))
test_dataloader = val_dataloader
val_evaluator = dict(type='IndoorMetric')
test_evaluator = val_evaluator
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=12, val_interval=1)
test_cfg = dict()
val_cfg = dict()
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001),
paramwise_cfg=dict(
custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)}),
clip_grad=dict(max_norm=35., norm_type=2))
param_scheduler = [
dict(
type='MultiStepLR',
begin=0,
end=12,
by_epoch=True,
milestones=[8, 11],
gamma=0.1)
]
# hooks
default_hooks = dict(
checkpoint=dict(type='CheckpointHook', interval=1, max_keep_ckpts=12))
# runtime
find_unused_parameters = True # only 1 of 4 FPN outputs is used
from .data_preprocessor import NeRFDetDataPreprocessor
from .formating import PackNeRFDetInputs
from .multiview_pipeline import MultiViewPipeline, RandomShiftOrigin
from .nerfdet import NerfDet
from .nerfdet_head import NerfDetHead
from .scannet_multiview_dataset import MultiViewScanNetDataset
__all__ = [
'MultiViewScanNetDataset', 'MultiViewPipeline', 'RandomShiftOrigin',
'PackNeRFDetInputs', 'NeRFDetDataPreprocessor', 'NerfDetHead', 'NerfDet'
]
# Copyright (c) OpenMMLab. All rights reserved.
import math
from numbers import Number
from typing import Dict, List, Optional, Sequence, Tuple, Union
import numpy as np
import torch
from mmdet.models import DetDataPreprocessor
from mmdet.models.utils.misc import samplelist_boxtype2tensor
from mmengine.model import stack_batch
from mmengine.utils import is_seq_of
from torch import Tensor
from torch.nn import functional as F
from mmdet3d.models.data_preprocessors.utils import multiview_img_stack_batch
from mmdet3d.models.data_preprocessors.voxelize import (
VoxelizationByGridShape, dynamic_scatter_3d)
from mmdet3d.registry import MODELS
from mmdet3d.structures.det3d_data_sample import SampleList
from mmdet3d.utils import OptConfigType
@MODELS.register_module()
class NeRFDetDataPreprocessor(DetDataPreprocessor):
"""In NeRF-Det, some extra information is needed in NeRF branch. We put the
datapreprocessor operations of these new information such as stack and pack
operations in this class. You can find the stack operations in subfuction
'collate_data' and the pack operations in 'simple_process'. Other codes are
the same as the default class 'DetDataPreprocessor'.
Points / Image pre-processor for point clouds / vision-only / multi-
modality 3D detection tasks.
    It provides the following data pre-processing:
- Collate and move image and point cloud data to the target device.
- 1) For image data:
- Pad images in inputs to the maximum size of current batch with defined
``pad_value``. The padding size can be divisible by a defined
``pad_size_divisor``.
- Stack images in inputs to batch_imgs.
- Convert images in inputs from bgr to rgb if the shape of input is
(3, H, W).
- Normalize images in inputs with defined std and mean.
- Do batch augmentations during training.
- 2) For point cloud data:
- If no voxelization, directly return list of point cloud data.
- If voxelization is applied, voxelize point cloud according to
``voxel_type`` and obtain ``voxels``.
Args:
voxel (bool): Whether to apply voxelization to point cloud.
Defaults to False.
voxel_type (str): Voxelization type. Two voxelization types are
provided: 'hard' and 'dynamic', respectively for hard voxelization
and dynamic voxelization. Defaults to 'hard'.
voxel_layer (dict or :obj:`ConfigDict`, optional): Voxelization layer
config. Defaults to None.
batch_first (bool): Whether to put the batch dimension to the first
dimension when getting voxel coordinates. Defaults to True.
max_voxels (int, optional): Maximum number of voxels in each voxel
grid. Defaults to None.
mean (Sequence[Number], optional): The pixel mean of R, G, B channels.
Defaults to None.
std (Sequence[Number], optional): The pixel standard deviation of
R, G, B channels. Defaults to None.
pad_size_divisor (int): The size of padded image should be divisible by
``pad_size_divisor``. Defaults to 1.
pad_value (float or int): The padded pixel value. Defaults to 0.
pad_mask (bool): Whether to pad instance masks. Defaults to False.
mask_pad_value (int): The padded pixel value for instance masks.
Defaults to 0.
pad_seg (bool): Whether to pad semantic segmentation maps.
Defaults to False.
seg_pad_value (int): The padded pixel value for semantic segmentation
maps. Defaults to 255.
bgr_to_rgb (bool): Whether to convert image from BGR to RGB.
Defaults to False.
rgb_to_bgr (bool): Whether to convert image from RGB to BGR.
Defaults to False.
boxtype2tensor (bool): Whether to convert the ``BaseBoxes`` type of
bboxes data to ``Tensor`` type. Defaults to True.
        non_blocking (bool): Whether to use non-blocking (asynchronous)
            transfer when moving data to the device. Defaults to False.
batch_augments (List[dict], optional): Batch-level augmentations.
Defaults to None.
"""
def __init__(self,
voxel: bool = False,
voxel_type: str = 'hard',
voxel_layer: OptConfigType = None,
batch_first: bool = True,
max_voxels: Optional[int] = None,
mean: Sequence[Number] = None,
std: Sequence[Number] = None,
pad_size_divisor: int = 1,
pad_value: Union[float, int] = 0,
pad_mask: bool = False,
mask_pad_value: int = 0,
pad_seg: bool = False,
seg_pad_value: int = 255,
bgr_to_rgb: bool = False,
rgb_to_bgr: bool = False,
boxtype2tensor: bool = True,
non_blocking: bool = False,
batch_augments: Optional[List[dict]] = None) -> None:
super(NeRFDetDataPreprocessor, self).__init__(
mean=mean,
std=std,
pad_size_divisor=pad_size_divisor,
pad_value=pad_value,
pad_mask=pad_mask,
mask_pad_value=mask_pad_value,
pad_seg=pad_seg,
seg_pad_value=seg_pad_value,
bgr_to_rgb=bgr_to_rgb,
rgb_to_bgr=rgb_to_bgr,
boxtype2tensor=boxtype2tensor,
non_blocking=non_blocking,
batch_augments=batch_augments)
self.voxel = voxel
self.voxel_type = voxel_type
self.batch_first = batch_first
self.max_voxels = max_voxels
if voxel:
self.voxel_layer = VoxelizationByGridShape(**voxel_layer)
def forward(self,
data: Union[dict, List[dict]],
training: bool = False) -> Union[dict, List[dict]]:
"""Perform normalization, padding and bgr2rgb conversion based on
``BaseDataPreprocessor``.
Args:
            data (dict or List[dict]): Data from the dataloader. The dict
                contains the whole batch data; when it is a list[dict], the
                list indicates test-time augmentation.
training (bool): Whether to enable training time augmentation.
Defaults to False.
Returns:
dict or List[dict]: Data in the same format as the model input.
"""
if isinstance(data, list):
num_augs = len(data)
aug_batch_data = []
for aug_id in range(num_augs):
single_aug_batch_data = self.simple_process(
data[aug_id], training)
aug_batch_data.append(single_aug_batch_data)
return aug_batch_data
else:
return self.simple_process(data, training)
def simple_process(self, data: dict, training: bool = False) -> dict:
"""Perform normalization, padding and bgr2rgb conversion for img data
based on ``BaseDataPreprocessor``, and voxelize point cloud if `voxel`
        is set to True.
Args:
data (dict): Data sampled from dataloader.
training (bool): Whether to enable training time augmentation.
Defaults to False.
Returns:
dict: Data in the same format as the model input.
"""
if 'img' in data['inputs']:
batch_pad_shape = self._get_pad_shape(data)
data = self.collate_data(data)
inputs, data_samples = data['inputs'], data['data_samples']
batch_inputs = dict()
if 'points' in inputs:
batch_inputs['points'] = inputs['points']
if self.voxel:
voxel_dict = self.voxelize(inputs['points'], data_samples)
batch_inputs['voxels'] = voxel_dict
if 'imgs' in inputs:
imgs = inputs['imgs']
if data_samples is not None:
# NOTE the batched image size information may be useful, e.g.
# in DETR, this is needed for the construction of masks, which
# is then used for the transformer_head.
batch_input_shape = tuple(imgs[0].size()[-2:])
for data_sample, pad_shape in zip(data_samples,
batch_pad_shape):
data_sample.set_metainfo({
'batch_input_shape': batch_input_shape,
'pad_shape': pad_shape
})
if self.boxtype2tensor:
samplelist_boxtype2tensor(data_samples)
if self.pad_mask:
self.pad_gt_masks(data_samples)
if self.pad_seg:
self.pad_gt_sem_seg(data_samples)
if training and self.batch_augments is not None:
for batch_aug in self.batch_augments:
imgs, data_samples = batch_aug(imgs, data_samples)
batch_inputs['imgs'] = imgs
        # Hard-coded here; to be refactored later.
# if len(inputs['depth']) != 0:
if 'depth' in inputs.keys():
batch_inputs['depth'] = inputs['depth']
batch_inputs['lightpos'] = inputs['lightpos']
batch_inputs['nerf_sizes'] = inputs['nerf_sizes']
batch_inputs['denorm_images'] = inputs['denorm_images']
batch_inputs['raydirs'] = inputs['raydirs']
return {'inputs': batch_inputs, 'data_samples': data_samples}
def preprocess_img(self, _batch_img: Tensor) -> Tensor:
# channel transform
if self._channel_conversion:
_batch_img = _batch_img[[2, 1, 0], ...]
# Convert to float after channel conversion to ensure
# efficiency
_batch_img = _batch_img.float()
# Normalization.
if self._enable_normalize:
if self.mean.shape[0] == 3:
assert _batch_img.dim() == 3 and _batch_img.shape[0] == 3, (
'If the mean has 3 values, the input tensor '
'should in shape of (3, H, W), but got the '
f'tensor with shape {_batch_img.shape}')
_batch_img = (_batch_img - self.mean) / self.std
return _batch_img
def collate_data(self, data: dict) -> dict:
"""Copy data to the target device and perform normalization, padding
and bgr2rgb conversion and stack based on ``BaseDataPreprocessor``.
Collates the data sampled from dataloader into a list of dict and list
of labels, and then copies tensor to the target device.
Args:
data (dict): Data sampled from dataloader.
Returns:
dict: Data in the same format as the model input.
"""
data = self.cast_data(data) # type: ignore
if 'img' in data['inputs']:
_batch_imgs = data['inputs']['img']
# Process data with `pseudo_collate`.
if is_seq_of(_batch_imgs, torch.Tensor):
batch_imgs = []
img_dim = _batch_imgs[0].dim()
for _batch_img in _batch_imgs:
if img_dim == 3: # standard img
_batch_img = self.preprocess_img(_batch_img)
elif img_dim == 4:
_batch_img = [
self.preprocess_img(_img) for _img in _batch_img
]
_batch_img = torch.stack(_batch_img, dim=0)
batch_imgs.append(_batch_img)
# Pad and stack Tensor.
if img_dim == 3:
batch_imgs = stack_batch(batch_imgs, self.pad_size_divisor,
self.pad_value)
elif img_dim == 4:
batch_imgs = multiview_img_stack_batch(
batch_imgs, self.pad_size_divisor, self.pad_value)
# Process data with `default_collate`.
elif isinstance(_batch_imgs, torch.Tensor):
assert _batch_imgs.dim() == 4, (
'The input of `ImgDataPreprocessor` should be a NCHW '
'tensor or a list of tensor, but got a tensor with '
f'shape: {_batch_imgs.shape}')
if self._channel_conversion:
_batch_imgs = _batch_imgs[:, [2, 1, 0], ...]
# Convert to float after channel conversion to ensure
# efficiency
_batch_imgs = _batch_imgs.float()
if self._enable_normalize:
_batch_imgs = (_batch_imgs - self.mean) / self.std
h, w = _batch_imgs.shape[2:]
target_h = math.ceil(
h / self.pad_size_divisor) * self.pad_size_divisor
target_w = math.ceil(
w / self.pad_size_divisor) * self.pad_size_divisor
pad_h = target_h - h
pad_w = target_w - w
batch_imgs = F.pad(_batch_imgs, (0, pad_w, 0, pad_h),
'constant', self.pad_value)
else:
raise TypeError(
'Output of `cast_data` should be a list of dict '
'or a tuple with inputs and data_samples, but got '
f'{type(data)}: {data}')
data['inputs']['imgs'] = batch_imgs
if 'raydirs' in data['inputs']:
_batch_dirs = data['inputs']['raydirs']
batch_dirs = stack_batch(_batch_dirs)
data['inputs']['raydirs'] = batch_dirs
if 'lightpos' in data['inputs']:
_batch_poses = data['inputs']['lightpos']
batch_poses = stack_batch(_batch_poses)
data['inputs']['lightpos'] = batch_poses
if 'denorm_images' in data['inputs']:
_batch_denorm_imgs = data['inputs']['denorm_images']
# Process data with `pseudo_collate`.
if is_seq_of(_batch_denorm_imgs, torch.Tensor):
denorm_img_dim = _batch_denorm_imgs[0].dim()
# Pad and stack Tensor.
if denorm_img_dim == 3:
batch_denorm_imgs = stack_batch(_batch_denorm_imgs,
self.pad_size_divisor,
self.pad_value)
elif denorm_img_dim == 4:
batch_denorm_imgs = multiview_img_stack_batch(
_batch_denorm_imgs, self.pad_size_divisor,
self.pad_value)
data['inputs']['denorm_images'] = batch_denorm_imgs
data.setdefault('data_samples', None)
return data
def _get_pad_shape(self, data: dict) -> List[Tuple[int, int]]:
"""Get the pad_shape of each image based on data and
pad_size_divisor."""
# rewrite `_get_pad_shape` for obtaining image inputs.
_batch_inputs = data['inputs']['img']
# Process data with `pseudo_collate`.
if is_seq_of(_batch_inputs, torch.Tensor):
batch_pad_shape = []
for ori_input in _batch_inputs:
if ori_input.dim() == 4:
                    # multi-view input: select one of the
                    # images to calculate the pad shape
ori_input = ori_input[0]
pad_h = int(
np.ceil(ori_input.shape[1] /
self.pad_size_divisor)) * self.pad_size_divisor
pad_w = int(
np.ceil(ori_input.shape[2] /
self.pad_size_divisor)) * self.pad_size_divisor
batch_pad_shape.append((pad_h, pad_w))
# Process data with `default_collate`.
elif isinstance(_batch_inputs, torch.Tensor):
assert _batch_inputs.dim() == 4, (
'The input of `ImgDataPreprocessor` should be a NCHW tensor '
'or a list of tensor, but got a tensor with shape: '
f'{_batch_inputs.shape}')
pad_h = int(
np.ceil(_batch_inputs.shape[1] /
self.pad_size_divisor)) * self.pad_size_divisor
pad_w = int(
np.ceil(_batch_inputs.shape[2] /
self.pad_size_divisor)) * self.pad_size_divisor
batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0]
else:
raise TypeError('Output of `cast_data` should be a list of dict '
'or a tuple with inputs and data_samples, but got '
f'{type(data)}: {data}')
return batch_pad_shape
@torch.no_grad()
def voxelize(self, points: List[Tensor],
data_samples: SampleList) -> Dict[str, Tensor]:
"""Apply voxelization to point cloud.
Args:
points (List[Tensor]): Point cloud in one data batch.
            data_samples (list[:obj:`NeRFDet3DDataSample`]): The annotation
                data of every sample. Voxel-wise annotations are added for
                segmentation.
Returns:
Dict[str, Tensor]: Voxelization information.
- voxels (Tensor): Features of voxels, shape is MxNxC for hard
voxelization, NxC for dynamic voxelization.
- coors (Tensor): Coordinates of voxels, shape is Nx(1+NDim),
where 1 represents the batch index.
- num_points (Tensor, optional): Number of points in each voxel.
- voxel_centers (Tensor, optional): Centers of voxels.
"""
voxel_dict = dict()
if self.voxel_type == 'hard':
voxels, coors, num_points, voxel_centers = [], [], [], []
for i, res in enumerate(points):
res_voxels, res_coors, res_num_points = self.voxel_layer(res)
res_voxel_centers = (
res_coors[:, [2, 1, 0]] + 0.5) * res_voxels.new_tensor(
self.voxel_layer.voxel_size) + res_voxels.new_tensor(
self.voxel_layer.point_cloud_range[0:3])
res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
voxels.append(res_voxels)
coors.append(res_coors)
num_points.append(res_num_points)
voxel_centers.append(res_voxel_centers)
voxels = torch.cat(voxels, dim=0)
coors = torch.cat(coors, dim=0)
num_points = torch.cat(num_points, dim=0)
voxel_centers = torch.cat(voxel_centers, dim=0)
voxel_dict['num_points'] = num_points
voxel_dict['voxel_centers'] = voxel_centers
elif self.voxel_type == 'dynamic':
coors = []
            # dynamic voxelization only provides a coors mapping
for i, res in enumerate(points):
res_coors = self.voxel_layer(res)
res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
coors.append(res_coors)
voxels = torch.cat(points, dim=0)
coors = torch.cat(coors, dim=0)
elif self.voxel_type == 'cylindrical':
voxels, coors = [], []
for i, (res, data_sample) in enumerate(zip(points, data_samples)):
rho = torch.sqrt(res[:, 0]**2 + res[:, 1]**2)
phi = torch.atan2(res[:, 1], res[:, 0])
polar_res = torch.stack((rho, phi, res[:, 2]), dim=-1)
min_bound = polar_res.new_tensor(
self.voxel_layer.point_cloud_range[:3])
max_bound = polar_res.new_tensor(
self.voxel_layer.point_cloud_range[3:])
                try:  # clamp with tensor bounds needs PyTorch >= 1.9.0
polar_res_clamp = torch.clamp(polar_res, min_bound,
max_bound)
except TypeError:
polar_res_clamp = polar_res.clone()
for coor_idx in range(3):
polar_res_clamp[:, coor_idx][
polar_res[:, coor_idx] >
max_bound[coor_idx]] = max_bound[coor_idx]
polar_res_clamp[:, coor_idx][
polar_res[:, coor_idx] <
min_bound[coor_idx]] = min_bound[coor_idx]
res_coors = torch.floor(
(polar_res_clamp - min_bound) / polar_res_clamp.new_tensor(
self.voxel_layer.voxel_size)).int()
self.get_voxel_seg(res_coors, data_sample)
res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
res_voxels = torch.cat((polar_res, res[:, :2], res[:, 3:]),
dim=-1)
voxels.append(res_voxels)
coors.append(res_coors)
voxels = torch.cat(voxels, dim=0)
coors = torch.cat(coors, dim=0)
elif self.voxel_type == 'minkunet':
voxels, coors = [], []
voxel_size = points[0].new_tensor(self.voxel_layer.voxel_size)
for i, (res, data_sample) in enumerate(zip(points, data_samples)):
res_coors = torch.round(res[:, :3] / voxel_size).int()
res_coors -= res_coors.min(0)[0]
res_coors_numpy = res_coors.cpu().numpy()
inds, point2voxel_map = self.sparse_quantize(
res_coors_numpy, return_index=True, return_inverse=True)
point2voxel_map = torch.from_numpy(point2voxel_map).cuda()
if self.training and self.max_voxels is not None:
if len(inds) > self.max_voxels:
inds = np.random.choice(
inds, self.max_voxels, replace=False)
inds = torch.from_numpy(inds).cuda()
if hasattr(data_sample.gt_pts_seg, 'pts_semantic_mask'):
data_sample.gt_pts_seg.voxel_semantic_mask \
= data_sample.gt_pts_seg.pts_semantic_mask[inds]
res_voxel_coors = res_coors[inds]
res_voxels = res[inds]
if self.batch_first:
res_voxel_coors = F.pad(
res_voxel_coors, (1, 0), mode='constant', value=i)
data_sample.batch_idx = res_voxel_coors[:, 0]
else:
res_voxel_coors = F.pad(
res_voxel_coors, (0, 1), mode='constant', value=i)
data_sample.batch_idx = res_voxel_coors[:, -1]
data_sample.point2voxel_map = point2voxel_map.long()
voxels.append(res_voxels)
coors.append(res_voxel_coors)
voxels = torch.cat(voxels, dim=0)
coors = torch.cat(coors, dim=0)
else:
raise ValueError(f'Invalid voxelization type {self.voxel_type}')
voxel_dict['voxels'] = voxels
voxel_dict['coors'] = coors
return voxel_dict
def get_voxel_seg(self, res_coors: Tensor,
data_sample: SampleList) -> None:
"""Get voxel-wise segmentation label and point2voxel map.
Args:
res_coors (Tensor): The voxel coordinates of points, Nx3.
            data_sample (:obj:`NeRFDet3DDataSample`): The annotation data of
                every sample. Voxel-wise annotations are added for
                segmentation.
"""
if self.training:
pts_semantic_mask = data_sample.gt_pts_seg.pts_semantic_mask
voxel_semantic_mask, _, point2voxel_map = dynamic_scatter_3d(
F.one_hot(pts_semantic_mask.long()).float(), res_coors, 'mean',
True)
voxel_semantic_mask = torch.argmax(voxel_semantic_mask, dim=-1)
data_sample.gt_pts_seg.voxel_semantic_mask = voxel_semantic_mask
data_sample.point2voxel_map = point2voxel_map
else:
pseudo_tensor = res_coors.new_ones([res_coors.shape[0], 1]).float()
_, _, point2voxel_map = dynamic_scatter_3d(pseudo_tensor,
res_coors, 'mean', True)
data_sample.point2voxel_map = point2voxel_map
def ravel_hash(self, x: np.ndarray) -> np.ndarray:
"""Get voxel coordinates hash for np.unique.
Args:
x (np.ndarray): The voxel coordinates of points, Nx3.
Returns:
            np.ndarray: Voxel coordinate hash.
"""
assert x.ndim == 2, x.shape
x = x - np.min(x, axis=0)
x = x.astype(np.uint64, copy=False)
xmax = np.max(x, axis=0).astype(np.uint64) + 1
h = np.zeros(x.shape[0], dtype=np.uint64)
for k in range(x.shape[1] - 1):
h += x[:, k]
h *= xmax[k + 1]
h += x[:, -1]
return h
def sparse_quantize(self,
coords: np.ndarray,
return_index: bool = False,
return_inverse: bool = False) -> List[np.ndarray]:
"""Sparse Quantization for voxel coordinates used in Minkunet.
Args:
coords (np.ndarray): The voxel coordinates of points, Nx3.
return_index (bool): Whether to return the indices of the unique
coords, shape (M,).
return_inverse (bool): Whether to return the indices of the
original coords, shape (N,).
Returns:
            List[np.ndarray]: The unique indices and/or the inverse map,
                depending on ``return_index`` and ``return_inverse``.
"""
_, indices, inverse_indices = np.unique(
self.ravel_hash(coords), return_index=True, return_inverse=True)
coords = coords[indices]
outputs = []
if return_index:
outputs += [indices]
if return_inverse:
outputs += [inverse_indices]
return outputs
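# --- Illustrative sketch only (added for exposition, not used by the class):
# a self-contained replay of the sparse quantization above, i.e. hash integer
# voxel coordinates with the same mixed-radix scheme as `ravel_hash`, then
# deduplicate with `np.unique` to get one representative point per voxel and
# a point-to-voxel inverse map. The toy coordinates and the helper name
# `_demo_sparse_quantize` are assumptions for illustration.
def _demo_sparse_quantize() -> None:
    coords = np.array([[0, 0, 0],
                       [0, 0, 0],
                       [1, 2, 3],
                       [1, 2, 3],
                       [4, 0, 1]])
    shifted = (coords - coords.min(0)).astype(np.uint64)
    xmax = shifted.max(0).astype(np.uint64) + 1
    h = np.zeros(shifted.shape[0], dtype=np.uint64)
    for k in range(shifted.shape[1] - 1):
        h += shifted[:, k]
        h *= xmax[k + 1]
    h += shifted[:, -1]
    _, keep, point2voxel = np.unique(
        h, return_index=True, return_inverse=True)
    # keep -> [0, 2, 4]: indices of one representative point per voxel.
    # point2voxel -> [0, 0, 1, 1, 2]: every point mapped to its voxel index.
    print(keep, point2voxel)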
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Sequence, Union
import mmengine
import numpy as np
import torch
from mmcv import BaseTransform
from mmengine.structures import InstanceData
from numpy import dtype
from mmdet3d.registry import TRANSFORMS
from mmdet3d.structures import BaseInstance3DBoxes, PointData
from mmdet3d.structures.points import BasePoints
# from .det3d_data_sample import Det3DDataSample
from .nerf_det3d_data_sample import NeRFDet3DDataSample
def to_tensor(
data: Union[torch.Tensor, np.ndarray, Sequence, int,
float]) -> torch.Tensor:
"""Convert objects of various python types to :obj:`torch.Tensor`.
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.
Args:
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
be converted.
Returns:
torch.Tensor: the converted data.
"""
if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
if data.dtype is dtype('float64'):
data = data.astype(np.float32)
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not mmengine.is_str(data):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError(f'type {type(data)} cannot be converted to tensor.')
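# --- Illustrative sketch only: the conversions performed by `to_tensor`
# above. The helper name `_demo_to_tensor` is an assumption added for
# exposition and is not part of the original module.
def _demo_to_tensor() -> None:
    # float64 arrays are downcast to float32 before conversion
    assert to_tensor(np.zeros((2, 3), dtype=np.float64)).dtype == torch.float32
    # tensors are returned unchanged
    assert to_tensor(torch.ones(4)).dtype == torch.float32
    # plain sequences go through `torch.tensor`
    assert to_tensor([1, 2, 3]).tolist() == [1, 2, 3]
    # Python ints / floats become 1-element Long / Float tensors
    assert to_tensor(5).dtype == torch.int64
    assert to_tensor(0.5).dtype == torch.float32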
@TRANSFORMS.register_module()
class PackNeRFDetInputs(BaseTransform):
INPUTS_KEYS = ['points', 'img']
NERF_INPUT_KEYS = [
'img', 'denorm_images', 'depth', 'lightpos', 'nerf_sizes', 'raydirs'
]
INSTANCEDATA_3D_KEYS = [
'gt_bboxes_3d', 'gt_labels_3d', 'attr_labels', 'depths', 'centers_2d'
]
INSTANCEDATA_2D_KEYS = [
'gt_bboxes',
'gt_bboxes_labels',
]
NERF_3D_KEYS = ['gt_images', 'gt_depths']
SEG_KEYS = [
'gt_seg_map', 'pts_instance_mask', 'pts_semantic_mask',
'gt_semantic_seg'
]
def __init__(
self,
keys: tuple,
meta_keys: tuple = ('img_path', 'ori_shape', 'img_shape', 'lidar2img',
'depth2img', 'cam2img', 'pad_shape',
'scale_factor', 'flip', 'pcd_horizontal_flip',
'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d',
'img_norm_cfg', 'num_pts_feats', 'pcd_trans',
'sample_idx', 'pcd_scale_factor', 'pcd_rotation',
'pcd_rotation_angle', 'lidar_path',
'transformation_3d_flow', 'trans_mat',
'affine_aug', 'sweep_img_metas', 'ori_cam2img',
'cam2global', 'crop_offset', 'img_crop_offset',
'resize_img_shape', 'lidar2cam', 'ori_lidar2img',
'num_ref_frames', 'num_views', 'ego2global',
'axis_align_matrix')
) -> None:
self.keys = keys
self.meta_keys = meta_keys
def _remove_prefix(self, key: str) -> str:
if key.startswith('gt_'):
key = key[3:]
return key
def transform(self, results: Union[dict,
List[dict]]) -> Union[dict, List[dict]]:
"""Method to pack the input data. when the value in this dict is a
list, it usually is in Augmentations Testing.
Args:
results (dict | list[dict]): Result dict from the data pipeline.
Returns:
dict | List[dict]:
- 'inputs' (dict): The forward data of models. It usually contains
following keys:
- points
- img
- 'data_samples' (:obj:`NeRFDet3DDataSample`): The annotation info
of the sample.
"""
        # test-time augmentation
if isinstance(results, list):
if len(results) == 1:
# simple test
return self.pack_single_results(results[0])
pack_results = []
for single_result in results:
pack_results.append(self.pack_single_results(single_result))
return pack_results
        # normal training and simple testing
elif isinstance(results, dict):
return self.pack_single_results(results)
else:
raise NotImplementedError
def pack_single_results(self, results: dict) -> dict:
"""Method to pack the single input data. when the value in this dict is
a list, it usually is in Augmentations Testing.
Args:
results (dict): Result dict from the data pipeline.
Returns:
dict: A dict contains
- 'inputs' (dict): The forward data of models. It usually contains
following keys:
- points
- img
- 'data_samples' (:obj:`NeRFDet3DDataSample`): The annotation info
of the sample.
"""
# Format 3D data
if 'points' in results:
if isinstance(results['points'], BasePoints):
results['points'] = results['points'].tensor
if 'img' in results:
if isinstance(results['img'], list):
# process multiple imgs in single frame
imgs = np.stack(results['img'], axis=0)
if imgs.flags.c_contiguous:
imgs = to_tensor(imgs).permute(0, 3, 1, 2).contiguous()
else:
imgs = to_tensor(
np.ascontiguousarray(imgs.transpose(0, 3, 1, 2)))
results['img'] = imgs
else:
img = results['img']
if len(img.shape) < 3:
img = np.expand_dims(img, -1)
                # To improve the computational speed by 3-5 times, apply:
# `torch.permute()` rather than `np.transpose()`.
# Refer to https://github.com/open-mmlab/mmdetection/pull/9533
# for more details
if img.flags.c_contiguous:
img = to_tensor(img).permute(2, 0, 1).contiguous()
else:
img = to_tensor(
np.ascontiguousarray(img.transpose(2, 0, 1)))
results['img'] = img
if 'depth' in results:
if isinstance(results['depth'], list):
# process multiple depth imgs in single frame
depth_imgs = np.stack(results['depth'], axis=0)
if depth_imgs.flags.c_contiguous:
depth_imgs = to_tensor(depth_imgs).contiguous()
else:
depth_imgs = to_tensor(np.ascontiguousarray(depth_imgs))
results['depth'] = depth_imgs
else:
depth_img = results['depth']
if len(depth_img.shape) < 3:
depth_img = np.expand_dims(depth_img, -1)
if depth_img.flags.c_contiguous:
depth_img = to_tensor(depth_img).contiguous()
else:
depth_img = to_tensor(np.ascontiguousarray(depth_img))
results['depth'] = depth_img
if 'ray_info' in results:
if isinstance(results['raydirs'], list):
raydirs = np.stack(results['raydirs'], axis=0)
if raydirs.flags.c_contiguous:
raydirs = to_tensor(raydirs).contiguous()
else:
raydirs = to_tensor(np.ascontiguousarray(raydirs))
results['raydirs'] = raydirs
if isinstance(results['lightpos'], list):
lightposes = np.stack(results['lightpos'], axis=0)
if lightposes.flags.c_contiguous:
lightposes = to_tensor(lightposes).contiguous()
else:
lightposes = to_tensor(np.ascontiguousarray(lightposes))
lightposes = lightposes.unsqueeze(1).repeat(
1, raydirs.shape[1], 1)
results['lightpos'] = lightposes
if isinstance(results['gt_images'], list):
gt_images = np.stack(results['gt_images'], axis=0)
if gt_images.flags.c_contiguous:
gt_images = to_tensor(gt_images).contiguous()
else:
gt_images = to_tensor(np.ascontiguousarray(gt_images))
results['gt_images'] = gt_images
if isinstance(results['gt_depths'],
list) and len(results['gt_depths']) != 0:
gt_depths = np.stack(results['gt_depths'], axis=0)
if gt_depths.flags.c_contiguous:
gt_depths = to_tensor(gt_depths).contiguous()
else:
gt_depths = to_tensor(np.ascontiguousarray(gt_depths))
results['gt_depths'] = gt_depths
if isinstance(results['denorm_images'], list):
denorm_imgs = np.stack(results['denorm_images'], axis=0)
if denorm_imgs.flags.c_contiguous:
denorm_imgs = to_tensor(denorm_imgs).permute(
0, 3, 1, 2).contiguous()
else:
denorm_imgs = to_tensor(
np.ascontiguousarray(
denorm_imgs.transpose(0, 3, 1, 2)))
results['denorm_images'] = denorm_imgs
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_bboxes_labels', 'attr_labels', 'pts_instance_mask',
'pts_semantic_mask', 'centers_2d', 'depths', 'gt_labels_3d'
]:
if key not in results:
continue
if isinstance(results[key], list):
results[key] = [to_tensor(res) for res in results[key]]
else:
results[key] = to_tensor(results[key])
if 'gt_bboxes_3d' in results:
if not isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes):
results['gt_bboxes_3d'] = to_tensor(results['gt_bboxes_3d'])
if 'gt_semantic_seg' in results:
results['gt_semantic_seg'] = to_tensor(
results['gt_semantic_seg'][None])
if 'gt_seg_map' in results:
results['gt_seg_map'] = results['gt_seg_map'][None, ...]
if 'gt_images' in results:
results['gt_images'] = to_tensor(results['gt_images'])
if 'gt_depths' in results:
results['gt_depths'] = to_tensor(results['gt_depths'])
data_sample = NeRFDet3DDataSample()
gt_instances_3d = InstanceData()
gt_instances = InstanceData()
gt_pts_seg = PointData()
gt_nerf_images = InstanceData()
gt_nerf_depths = InstanceData()
data_metas = {}
for key in self.meta_keys:
if key in results:
data_metas[key] = results[key]
elif 'images' in results:
if len(results['images'].keys()) == 1:
cam_type = list(results['images'].keys())[0]
# single-view image
if key in results['images'][cam_type]:
data_metas[key] = results['images'][cam_type][key]
else:
# multi-view image
img_metas = []
cam_types = list(results['images'].keys())
for cam_type in cam_types:
if key in results['images'][cam_type]:
img_metas.append(results['images'][cam_type][key])
if len(img_metas) > 0:
data_metas[key] = img_metas
elif 'lidar_points' in results:
if key in results['lidar_points']:
data_metas[key] = results['lidar_points'][key]
data_sample.set_metainfo(data_metas)
inputs = {}
for key in self.keys:
if key in results:
# if key in self.INPUTS_KEYS:
if key in self.NERF_INPUT_KEYS:
inputs[key] = results[key]
elif key in self.NERF_3D_KEYS:
if key == 'gt_images':
gt_nerf_images[self._remove_prefix(key)] = results[key]
else:
gt_nerf_depths[self._remove_prefix(key)] = results[key]
elif key in self.INSTANCEDATA_3D_KEYS:
gt_instances_3d[self._remove_prefix(key)] = results[key]
elif key in self.INSTANCEDATA_2D_KEYS:
if key == 'gt_bboxes_labels':
gt_instances['labels'] = results[key]
else:
gt_instances[self._remove_prefix(key)] = results[key]
elif key in self.SEG_KEYS:
gt_pts_seg[self._remove_prefix(key)] = results[key]
else:
                raise NotImplementedError(f'Please modify '
                                          f'`PackNeRFDetInputs` '
                                          f'to put {key} into the '
                                          f'corresponding field')
data_sample.gt_instances_3d = gt_instances_3d
data_sample.gt_instances = gt_instances
data_sample.gt_pts_seg = gt_pts_seg
data_sample.gt_nerf_images = gt_nerf_images
data_sample.gt_nerf_depths = gt_nerf_depths
if 'eval_ann_info' in results:
data_sample.eval_ann_info = results['eval_ann_info']
else:
data_sample.eval_ann_info = None
packed_results = dict()
packed_results['data_samples'] = data_sample
packed_results['inputs'] = inputs
return packed_results
def __repr__(self) -> str:
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(keys={self.keys})'
repr_str += f'(meta_keys={self.meta_keys})'
return repr_str
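# --- Illustrative sketch only: packing a toy pipeline result with
# `PackNeRFDetInputs`. The dummy shapes and the helper name
# `_demo_pack_inputs` are assumptions for exposition; a real pipeline result
# carries the full set of keys collected in the NeRF-Det config.
def _demo_pack_inputs() -> None:
    packer = PackNeRFDetInputs(keys=('img', 'gt_bboxes_3d', 'gt_labels_3d'))
    results = dict(
        img=np.zeros((8, 8, 3), dtype=np.float32),        # one HxWx3 image
        gt_bboxes_3d=np.zeros((2, 7), dtype=np.float32),  # two dummy boxes
        gt_labels_3d=np.array([0, 1]))
    packed = packer(results)
    # The image is converted to a (3, 8, 8) tensor and stored under 'inputs';
    # boxes and labels land in `data_samples.gt_instances_3d`.
    print(packed['inputs']['img'].shape,
          packed['data_samples'].gt_instances_3d)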