Unverified Commit 583e9075 authored by Jingwei Zhang, committed by GitHub

[Feature] Support training of BEVFusion (#2558)

* support train on nus

* refactor transfusion head

* img branch optional

* support nuscenes_mini in replace_ceph_backend

* use replace_ceph

* add only-lidar

* use valid_flag in dataset filter

* support lidar-only training 69

* fix RTS

* fix rotation in ImgAug3D

* revert to original rotation in ImgAug3D

* add LSSDepthTransform and parse_losses

* fix LoadMultiSweeps

* fix bug about points in-place operations

* support amp and replace syncBN by BN

* add amp config

* set growth-interval in amp

* Revert "fix LoadMultiSweeps"

This reverts commit ab27ea1941f80c5d7fb3071005109f4794c9c9cb.

* add float in cls loss

* iter_based lr in fusion stage

* rename config

* use normalization query pos for stable training

* remove unnecessary code & simplify config & train 5 epoch

* smaller eta_min_ratio

* polish code

* fix UT

* Revert "use normalization query pos for stable training"

This reverts commit 30091188d4a665beaadcd5a9b49e04c9da47139e.

* update readme

* fix height offset
parent ed46b8c1
# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.dataset import BaseDataset
from mmengine.hooks import Hook
from mmengine.model import is_model_wrapper
from mmengine.runner import Runner
......@@ -35,7 +36,11 @@ class DisableObjectSampleHook(Hook):
model = model.module
if epoch == self.disable_after_epoch:
runner.logger.info('Disable ObjectSample')
for transform in runner.train_dataloader.dataset.pipeline.transforms: # noqa: E501
dataset = runner.train_dataloader.dataset
# handle dataset wrapper
if not isinstance(dataset, BaseDataset):
dataset = dataset.dataset
for transform in dataset.pipeline.transforms: # noqa: E501
if isinstance(transform, ObjectSample):
assert hasattr(transform, 'disabled')
transform.disabled = True
......
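The change above makes `DisableObjectSampleHook` unwrap dataset wrappers (e.g. `CBGSDataset`) before searching the pipeline for `ObjectSample`. For reference, a minimal sketch of how the hook is enabled in a training config, mirroring the lidar-only config later in this PR:

```python
# Turn off the ObjectSample (GT database paste) augmentation after epoch 15,
# as done in the lidar-only BEVFusion config added in this PR.
custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)]
```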
......@@ -15,7 +15,7 @@ results is available at https://github.com/mit-han-lab/bevfusion.
## Introduction
We implement BEVFusion and provide the results and pretrained checkpoints on NuScenes dataset.
We implement BEVFusion and support training and testing on the NuScenes dataset.
## Usage
......@@ -34,38 +34,41 @@ python projects/BEVFusion/setup.py develop
Run a demo on NuScenes data using [BEVFusion model](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link):
```shell
python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show
python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show
```
### Training commands
In MMDetection3D's root directory, run the following command to train the model:
1. You should train the lidar-only detector first:
```bash
python tools/train.py projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
bash tools/dist_train.sh projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py 8
```
For multi-gpu training, run:
2. Download the [Swin pre-trained model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/swint-nuimages-pretrained.pth). With this pre-trained image backbone and the lidar-only detector trained in step 1, you can train the lidar-camera fusion model:
```bash
python -m torch.distributed.launch --nnodes=1 --node_rank=0 --nproc_per_node=${NUM_GPUS} --master_port=29506 --master_addr="127.0.0.1" tools/train.py projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
bash tools/dist_train.sh projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py 8 --cfg-options load_from=${LIDAR_PRETRAINED_CHECKPOINT} model.img_backbone.init_cfg.checkpoint=${IMAGE_PRETRAINED_BACKBONE}
```
**Note** that if you want to reduce CUDA memory usage and computational overhead, you can simply append `--amp` to the above training commands to train the model in fp16 (mixed-precision) mode.
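For example, a fusion-stage training command with mixed precision enabled (same arguments as above, with `--amp` appended):

```bash
bash tools/dist_train.sh projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py 8 --cfg-options load_from=${LIDAR_PRETRAINED_CHECKPOINT} model.img_backbone.init_cfg.checkpoint=${IMAGE_PRETRAINED_BACKBONE} --amp
```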
### Testing commands
In MMDetection3D's root directory, run the following command to test the model:
```bash
python tools/test.py projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_PATH}
bash tools/dist_test.sh projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_PATH} 8
```
## Results and models
### NuScenes
| Backbone | Voxel type (voxel size) | NMS | Mem (GB) | Inf time (fps) | NDS | mAP | Download |
| :-----------------------------------------------------------------------------: | :---------------------: | :-: | :------: | :------------: | :---: | :---: | :------------------------------------------------------------------------------------------------------: |
| [SECFPN](./configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py) | voxel (0.075) | × | - | - | 71.62 | 68.77 | [converted_model](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link) |
| Modality | Voxel type (voxel size) | NMS | Mem (GB) | Inf time (fps) | NDS | mAP | Download |
| :------------------------------------------------------------------------------------------: | :---------------------: | :-: | :------: | :------------: | :--: | :--: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| [lidar](./configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py) | voxel (0.075) | × | - | - | 69.6 | 64.9 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-2628f933.pth) [logs](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d_20230322_053447.log) |
| [lidar-cam](./configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py) | voxel (0.075) | × | - | - | 71.4 | 68.6 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-5239b1af.pth) [logs](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d_20230524_001539.log) |
## Citation
......@@ -103,9 +106,9 @@ A project does not necessarily have to be finished in a single PR, but it's esse
<!-- As this template does. -->
- [ ] Milestone 2: Indicates a successful model implementation.
- [x] Milestone 2: Indicates a successful model implementation.
- [ ] Training-time correctness
- [x] Training-time correctness
<!-- If you are reproducing the result from a paper, checking this item means that you should have trained your model from scratch based on the original paper's specification and verified that the final result matches the report within a minor error range. -->
......
from .bevfusion import BEVFusion
from .bevfusion_necks import GeneralizedLSSFPN
from .depth_lss import DepthLSSTransform
from .depth_lss import DepthLSSTransform, LSSTransform
from .loading import BEVLoadMultiViewImageFromFiles
from .sparse_encoder import BEVFusionSparseEncoder
from .transformer import TransformerDecoderLayer
from .transforms_3d import GridMask, ImageAug3D
from .transforms_3d import (BEVFusionGlobalRotScaleTrans,
BEVFusionRandomFlip3D, GridMask, ImageAug3D)
from .transfusion_head import ConvFuser, TransFusionHead
from .utils import (BBoxBEVL1Cost, HeuristicAssigner3D, HungarianAssigner3D,
IoU3DCost)
......@@ -12,7 +13,8 @@ from .utils import (BBoxBEVL1Cost, HeuristicAssigner3D, HungarianAssigner3D,
__all__ = [
'BEVFusion', 'TransFusionHead', 'ConvFuser', 'ImageAug3D', 'GridMask',
'GeneralizedLSSFPN', 'HungarianAssigner3D', 'BBoxBEVL1Cost', 'IoU3DCost',
'HeuristicAssigner3D', 'DepthLSSTransform',
'HeuristicAssigner3D', 'DepthLSSTransform', 'LSSTransform',
'BEVLoadMultiViewImageFromFiles', 'BEVFusionSparseEncoder',
'TransformerDecoderLayer'
'TransformerDecoderLayer', 'BEVFusionRandomFlip3D',
'BEVFusionGlobalRotScaleTrans'
]
from typing import Dict, List, Optional
from collections import OrderedDict
from copy import deepcopy
from typing import Dict, List, Optional, Tuple
import numpy as np
import torch
import torch.distributed as dist
from mmengine.utils import is_list_of
from torch import Tensor
from torch.nn import functional as F
......@@ -23,7 +27,7 @@ class BEVFusion(Base3DDetector):
fusion_layer: Optional[dict] = None,
img_backbone: Optional[dict] = None,
pts_backbone: Optional[dict] = None,
vtransform: Optional[dict] = None,
view_transform: Optional[dict] = None,
img_neck: Optional[dict] = None,
pts_neck: Optional[dict] = None,
bbox_head: Optional[dict] = None,
......@@ -40,20 +44,21 @@ class BEVFusion(Base3DDetector):
self.pts_voxel_encoder = MODELS.build(pts_voxel_encoder)
self.img_backbone = MODELS.build(img_backbone)
self.img_neck = MODELS.build(img_neck)
self.vtransform = MODELS.build(vtransform)
self.img_backbone = MODELS.build(
img_backbone) if img_backbone is not None else None
self.img_neck = MODELS.build(
img_neck) if img_neck is not None else None
self.view_transform = MODELS.build(
view_transform) if view_transform is not None else None
self.pts_middle_encoder = MODELS.build(pts_middle_encoder)
self.fusion_layer = MODELS.build(fusion_layer)
self.fusion_layer = MODELS.build(
fusion_layer) if fusion_layer is not None else None
self.pts_backbone = MODELS.build(pts_backbone)
self.pts_neck = MODELS.build(pts_neck)
self.bbox_head = MODELS.build(bbox_head)
# hard code here where using converted checkpoint of original
# implementation of `BEVFusion`
self.use_converted_checkpoint = True
self.init_weights()
......@@ -67,6 +72,46 @@ class BEVFusion(Base3DDetector):
"""
pass
def parse_losses(
self, losses: Dict[str, torch.Tensor]
) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
"""Parses the raw outputs (losses) of the network.
Args:
losses (dict): Raw output of the network, which usually contain
losses and other necessary information.
Returns:
tuple[Tensor, dict]: There are two elements. The first is the
loss tensor passed to optim_wrapper which may be a weighted sum
of all losses, and the second is log_vars which will be sent to
the logger.
"""
log_vars = []
for loss_name, loss_value in losses.items():
if isinstance(loss_value, torch.Tensor):
log_vars.append([loss_name, loss_value.mean()])
elif is_list_of(loss_value, torch.Tensor):
log_vars.append(
[loss_name,
sum(_loss.mean() for _loss in loss_value)])
else:
raise TypeError(
f'{loss_name} is not a tensor or list of tensors')
loss = sum(value for key, value in log_vars if 'loss' in key)
log_vars.insert(0, ['loss', loss])
log_vars = OrderedDict(log_vars) # type: ignore
for loss_name, loss_value in log_vars.items():
# reduce loss when distributed training
if dist.is_available() and dist.is_initialized():
loss_value = loss_value.data.clone()
dist.all_reduce(loss_value.div_(dist.get_world_size()))
log_vars[loss_name] = loss_value.item()
return loss, log_vars # type: ignore
def init_weights(self) -> None:
if self.img_backbone is not None:
self.img_backbone.init_weights()
......@@ -94,7 +139,7 @@ class BEVFusion(Base3DDetector):
img_metas,
) -> torch.Tensor:
B, N, C, H, W = x.size()
x = x.view(B * N, C, H, W)
x = x.view(B * N, C, H, W).contiguous()
x = self.img_backbone(x)
x = self.img_neck(x)
......@@ -105,7 +150,8 @@ class BEVFusion(Base3DDetector):
BN, C, H, W = x.size()
x = x.view(B, int(BN / B), C, H, W)
x = self.vtransform(
with torch.autocast(device_type='cuda', dtype=torch.float32):
x = self.view_transform(
x,
points,
lidar2image,
......@@ -119,6 +165,8 @@ class BEVFusion(Base3DDetector):
def extract_pts_feat(self, batch_inputs_dict) -> torch.Tensor:
points = batch_inputs_dict['points']
with torch.autocast('cuda', enabled=False):
points = [point.float() for point in points]
feats, coords, sizes = self.voxelize(points)
batch_size = coords[-1, 0] + 1
x = self.pts_middle_encoder(feats, coords, batch_size)
......@@ -184,11 +232,6 @@ class BEVFusion(Base3DDetector):
if self.with_bbox_head:
outputs = self.bbox_head.predict(feats, batch_input_metas)
if self.use_converted_checkpoint:
outputs[0]['bboxes_3d'].tensor[:, 6] = -outputs[0][
'bboxes_3d'].tensor[:, 6] - np.pi / 2
outputs[0]['bboxes_3d'].tensor[:, 3:5] = outputs[0][
'bboxes_3d'].tensor[:, [4, 3]]
res = self.add_pred_to_datasample(batch_data_samples, outputs)
......@@ -202,7 +245,9 @@ class BEVFusion(Base3DDetector):
):
imgs = batch_inputs_dict.get('imgs', None)
points = batch_inputs_dict.get('points', None)
features = []
if imgs is not None:
imgs = imgs.contiguous()
lidar2image, camera_intrinsics, camera2lidar = [], [], []
img_aug_matrix, lidar_aug_matrix = [], []
for i, meta in enumerate(batch_input_metas):
......@@ -210,20 +255,22 @@ class BEVFusion(Base3DDetector):
camera_intrinsics.append(meta['cam2img'])
camera2lidar.append(meta['cam2lidar'])
img_aug_matrix.append(meta.get('img_aug_matrix', np.eye(4)))
lidar_aug_matrix.append(meta.get('lidar_aug_matrix', np.eye(4)))
lidar_aug_matrix.append(
meta.get('lidar_aug_matrix', np.eye(4)))
lidar2image = imgs.new_tensor(np.asarray(lidar2image))
camera_intrinsics = imgs.new_tensor(np.array(camera_intrinsics))
camera2lidar = imgs.new_tensor(np.asarray(camera2lidar))
img_aug_matrix = imgs.new_tensor(np.asarray(img_aug_matrix))
lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix))
img_feature = self.extract_img_feat(imgs, points, lidar2image,
camera_intrinsics, camera2lidar,
img_aug_matrix, lidar_aug_matrix,
img_feature = self.extract_img_feat(imgs, deepcopy(points),
lidar2image, camera_intrinsics,
camera2lidar, img_aug_matrix,
lidar_aug_matrix,
batch_input_metas)
features.append(img_feature)
pts_feature = self.extract_pts_feat(batch_inputs_dict)
features = [img_feature, pts_feature]
features.append(pts_feature)
if self.fusion_layer is not None:
x = self.fusion_layer(features)
......@@ -239,4 +286,13 @@ class BEVFusion(Base3DDetector):
def loss(self, batch_inputs_dict: Dict[str, Optional[Tensor]],
batch_data_samples: List[Det3DDataSample],
**kwargs) -> List[Det3DDataSample]:
pass
batch_input_metas = [item.metainfo for item in batch_data_samples]
feats = self.extract_feat(batch_inputs_dict, batch_input_metas)
losses = dict()
if self.with_bbox_head:
bbox_loss = self.bbox_head.loss(feats, batch_data_samples)
losses.update(bbox_loss)
return losses
......@@ -17,7 +17,7 @@ def gen_dx_bx(xbound, ybound, zbound):
return dx, bx, nx
class BaseTransform(nn.Module):
class BaseViewTransform(nn.Module):
def __init__(
self,
......@@ -156,6 +156,7 @@ class BaseTransform(nn.Module):
camera2lidar,
img_aug_matrix,
lidar_aug_matrix,
metas,
**kwargs,
):
intrins = camera_intrinsics[..., :3, :3]
......@@ -182,7 +183,77 @@ class BaseTransform(nn.Module):
return x
class BaseDepthTransform(BaseTransform):
@MODELS.register_module()
class LSSTransform(BaseViewTransform):
def __init__(
self,
in_channels: int,
out_channels: int,
image_size: Tuple[int, int],
feature_size: Tuple[int, int],
xbound: Tuple[float, float, float],
ybound: Tuple[float, float, float],
zbound: Tuple[float, float, float],
dbound: Tuple[float, float, float],
downsample: int = 1,
) -> None:
super().__init__(
in_channels=in_channels,
out_channels=out_channels,
image_size=image_size,
feature_size=feature_size,
xbound=xbound,
ybound=ybound,
zbound=zbound,
dbound=dbound,
)
self.depthnet = nn.Conv2d(in_channels, self.D + self.C, 1)
if downsample > 1:
assert downsample == 2, downsample
self.downsample = nn.Sequential(
nn.Conv2d(
out_channels, out_channels, 3, padding=1, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(True),
nn.Conv2d(
out_channels,
out_channels,
3,
stride=downsample,
padding=1,
bias=False,
),
nn.BatchNorm2d(out_channels),
nn.ReLU(True),
nn.Conv2d(
out_channels, out_channels, 3, padding=1, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(True),
)
else:
self.downsample = nn.Identity()
def get_cam_feats(self, x):
B, N, C, fH, fW = x.shape
x = x.view(B * N, C, fH, fW)
x = self.depthnet(x)
depth = x[:, :self.D].softmax(dim=1)
x = depth.unsqueeze(1) * x[:, self.D:(self.D + self.C)].unsqueeze(2)
x = x.view(B, N, self.C, self.D, fH, fW)
x = x.permute(0, 1, 3, 4, 5, 2)
return x
def forward(self, *args, **kwargs):
x = super().forward(*args, **kwargs)
x = self.downsample(x)
return x
class BaseDepthTransform(BaseViewTransform):
def forward(
self,
......@@ -202,8 +273,6 @@ class BaseDepthTransform(BaseTransform):
camera2lidar_rots = camera2lidar[..., :3, :3]
camera2lidar_trans = camera2lidar[..., :3, 3]
# print(img.shape, self.image_size, self.feature_size)
batch_size = len(points)
depth = torch.zeros(batch_size, img.shape[1], 1,
*self.image_size).to(points[0].device)
......@@ -241,6 +310,7 @@ class BaseDepthTransform(BaseTransform):
for c in range(on_img.shape[0]):
masked_coords = cur_coords[c, on_img[c]].long()
masked_dist = dist[c, on_img[c]]
depth = depth.to(masked_dist.dtype)
depth[b, c, 0, masked_coords[:, 0],
masked_coords[:, 1]] = masked_dist
......@@ -276,6 +346,8 @@ class DepthLSSTransform(BaseDepthTransform):
dbound: Tuple[float, float, float],
downsample: int = 1,
) -> None:
"""Compared with `LSSTransform`, `DepthLSSTransform` adds sparse depth
information from lidar points into the inputs of the `depthnet`."""
super().__init__(
in_channels=in_channels,
out_channels=out_channels,
......
......@@ -6,6 +6,7 @@ import torch
from mmcv.transforms import BaseTransform
from PIL import Image
from mmdet3d.datasets import GlobalRotScaleTrans
from mmdet3d.registry import TRANSFORMS
......@@ -107,6 +108,84 @@ class ImageAug3D(BaseTransform):
return data
@TRANSFORMS.register_module()
class BEVFusionRandomFlip3D:
"""Compared with `RandomFlip3D`, this class directly records the lidar
augmentation matrix in the `data`."""
def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
flip_horizontal = np.random.choice([0, 1])
flip_vertical = np.random.choice([0, 1])
rotation = np.eye(3)
if flip_horizontal:
rotation = np.array([[1, 0, 0], [0, -1, 0], [0, 0, 1]]) @ rotation
if 'points' in data:
data['points'].flip('horizontal')
if 'gt_bboxes_3d' in data:
data['gt_bboxes_3d'].flip('horizontal')
if 'gt_masks_bev' in data:
data['gt_masks_bev'] = data['gt_masks_bev'][:, :, ::-1].copy()
if flip_vertical:
rotation = np.array([[-1, 0, 0], [0, 1, 0], [0, 0, 1]]) @ rotation
if 'points' in data:
data['points'].flip('vertical')
if 'gt_bboxes_3d' in data:
data['gt_bboxes_3d'].flip('vertical')
if 'gt_masks_bev' in data:
data['gt_masks_bev'] = data['gt_masks_bev'][:, ::-1, :].copy()
if 'lidar_aug_matrix' not in data:
data['lidar_aug_matrix'] = np.eye(4)
data['lidar_aug_matrix'][:3, :] = rotation @ data[
'lidar_aug_matrix'][:3, :]
return data
@TRANSFORMS.register_module()
class BEVFusionGlobalRotScaleTrans(GlobalRotScaleTrans):
"""Compared with `GlobalRotScaleTrans`, the augmentation order in this
class is rotation, translation and scaling (RTS)."""
def transform(self, input_dict: dict) -> dict:
"""Private function to rotate, scale and translate bounding boxes and
points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after scaling, 'points', 'pcd_rotation',
'pcd_scale_factor', 'pcd_trans' and `gt_bboxes_3d` are updated
in the result dict.
"""
if 'transformation_3d_flow' not in input_dict:
input_dict['transformation_3d_flow'] = []
self._rot_bbox_points(input_dict)
if 'pcd_scale_factor' not in input_dict:
self._random_scale(input_dict)
self._trans_bbox_points(input_dict)
self._scale_bbox_points(input_dict)
input_dict['transformation_3d_flow'].extend(['R', 'T', 'S'])
lidar_augs = np.eye(4)
lidar_augs[:3, :3] = input_dict['pcd_rotation'].T * input_dict[
'pcd_scale_factor']
lidar_augs[:3, 3] = input_dict['pcd_trans'] * \
input_dict['pcd_scale_factor']
if 'lidar_aug_matrix' not in input_dict:
input_dict['lidar_aug_matrix'] = np.eye(4)
input_dict[
'lidar_aug_matrix'] = lidar_augs @ input_dict['lidar_aug_matrix']
return input_dict
@TRANSFORMS.register_module()
class GridMask(BaseTransform):
......
# modify from https://github.com/mit-han-lab/bevfusion
import copy
from typing import List
from typing import List, Tuple
import numpy as np
import torch
......@@ -75,8 +75,6 @@ class TransFusionHead(nn.Module):
):
super(TransFusionHead, self).__init__()
self.fp16_enabled = False
self.num_classes = num_classes
self.num_proposals = num_proposals
self.auxiliary = auxiliary
......@@ -226,7 +224,8 @@ class TransFusionHead(nn.Module):
#################################
# query initialization
#################################
dense_heatmap = self.heatmap_head(fusion_feat)
with torch.autocast('cuda', enabled=False):
dense_heatmap = self.heatmap_head(fusion_feat.float())
heatmap = dense_heatmap.detach().sigmoid()
padding = self.nms_kernel_size // 2
local_max = torch.zeros_like(heatmap)
......@@ -482,8 +481,6 @@ class TransFusionHead(nn.Module):
ret = dict(bboxes=boxes3d, scores=scores, labels=labels)
temp_instances = InstanceData()
ret['bboxes'][:, 2] = ret[
'bboxes'][:, 2] - ret['bboxes'][:, 5] * 0.5 # noqa: E501
temp_instances.bboxes_3d = metas[0]['box_type_3d'](
ret['bboxes'], box_dim=ret['bboxes'].shape[-1])
temp_instances.scores_3d = ret['scores']
......@@ -498,12 +495,23 @@ class TransFusionHead(nn.Module):
return rets[0]
def get_targets(self, gt_bboxes_3d, gt_labels_3d, preds_dict):
def get_targets(self, batch_gt_instances_3d: List[InstanceData],
preds_dict: List[dict]):
"""Generate training targets.
Args:
gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes.
gt_labels_3d (torch.Tensor): Labels of boxes.
preds_dicts (tuple of dict): first index by layer (default 1)
batch_gt_instances_3d (List[InstanceData]):
preds_dict (list[dict]): The prediction results. The index of the
list is the index of layers. The inner dict contains
predictions of one mini-batch:
- center: (bs, 2, num_proposals)
- height: (bs, 1, num_proposals)
- dim: (bs, 3, num_proposals)
- rot: (bs, 2, num_proposals)
- vel: (bs, 2, num_proposals)
- cls_logit: (bs, num_classes, num_proposals)
- query_score: (bs, num_classes, num_proposals)
- heatmap: The original heatmap before fed into transformer
decoder, with shape (bs, 10, h, w)
Returns:
tuple[torch.Tensor]: Tuple of target including \
the following results in order.
......@@ -516,20 +524,23 @@ class TransFusionHead(nn.Module):
# change preds_dict into list of dict (index by batch_id)
# preds_dict[0]['center'].shape [bs, 3, num_proposal]
list_of_pred_dict = []
for batch_idx in range(len(gt_bboxes_3d)):
for batch_idx in range(len(batch_gt_instances_3d)):
pred_dict = {}
for key in preds_dict[0].keys():
pred_dict[key] = preds_dict[0][key][batch_idx:batch_idx + 1]
preds = []
for i in range(self.num_decoder_layers):
pred_one_layer = preds_dict[i][key][batch_idx:batch_idx +
1]
preds.append(pred_one_layer)
pred_dict[key] = torch.cat(preds)
list_of_pred_dict.append(pred_dict)
assert len(gt_bboxes_3d) == len(list_of_pred_dict)
assert len(batch_gt_instances_3d) == len(list_of_pred_dict)
res_tuple = multi_apply(
self.get_targets_single,
gt_bboxes_3d,
gt_labels_3d,
batch_gt_instances_3d,
list_of_pred_dict,
np.arange(len(gt_labels_3d)),
np.arange(len(batch_gt_instances_3d)),
)
labels = torch.cat(res_tuple[0], dim=0)
label_weights = torch.cat(res_tuple[1], dim=0)
......@@ -550,23 +561,26 @@ class TransFusionHead(nn.Module):
heatmap,
)
def get_targets_single(self, gt_bboxes_3d, gt_labels_3d, preds_dict,
batch_idx):
def get_targets_single(self, gt_instances_3d, preds_dict, batch_idx):
"""Generate training targets for a single sample.
Args:
gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes.
gt_labels_3d (torch.Tensor): Labels of boxes.
preds_dict (dict): dict of prediction result for a single sample
gt_instances_3d (:obj:`InstanceData`): ground truth of instances.
preds_dict (dict): dict of prediction result for a single sample.
Returns:
tuple[torch.Tensor]: Tuple of target including \
the following results in order.
- torch.Tensor: classification target. [1, num_proposals]
- torch.Tensor: classification weights (mask) [1, num_proposals] # noqa: E501
- torch.Tensor: classification weights (mask) [1,
num_proposals] # noqa: E501
- torch.Tensor: regression target. [1, num_proposals, 8]
- torch.Tensor: regression weights. [1, num_proposals, 8]
- torch.Tensor: iou target. [1, num_proposals]
- int: number of positive proposals
- torch.Tensor: heatmap targets.
"""
# 1. Assignment
gt_bboxes_3d = gt_instances_3d.bboxes_3d
gt_labels_3d = gt_instances_3d.labels_3d
num_proposals = preds_dict['center'].shape[-1]
# get pred boxes, carefully ! don't change the network outputs
......@@ -628,14 +642,19 @@ class TransFusionHead(nn.Module):
[res.max_overlaps for res in assign_result_list]),
labels=torch.cat([res.labels for res in assign_result_list]),
)
# 2. Sampling. Compatible with the interface of `PseudoSampler` in
# mmdet.
gt_instances, pred_instances = InstanceData(
bboxes=gt_bboxes_tensor), InstanceData(priors=bboxes_tensor)
sampling_result = self.bbox_sampler.sample(assign_result_ensemble,
bboxes_tensor,
gt_bboxes_tensor)
pred_instances,
gt_instances)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
assert len(pos_inds) + len(neg_inds) == num_proposals
# create target for loss computation
# 3. Create target for loss computation
bbox_targets = torch.zeros([num_proposals, self.bbox_coder.code_size
]).to(center.device)
bbox_weights = torch.zeros([num_proposals, self.bbox_coder.code_size
......@@ -723,17 +742,29 @@ class TransFusionHead(nn.Module):
heatmap[None],
)
def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs):
def loss(self, batch_feats, batch_data_samples):
"""Loss function for CenterHead.
Args:
gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground
truth gt boxes.
gt_labels_3d (list[torch.Tensor]): Labels of boxes.
preds_dicts (list[list[dict]]): Output of forward function.
batch_feats (): Features in a batch.
batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
Samples. It usually includes information such as
`gt_instance_3d`.
Returns:
dict[str:torch.Tensor]: Loss of heatmap and bbox of each task.
"""
batch_input_metas, batch_gt_instances_3d = [], []
for data_sample in batch_data_samples:
batch_input_metas.append(data_sample.metainfo)
batch_gt_instances_3d.append(data_sample.gt_instances_3d)
preds_dicts = self(batch_feats, batch_input_metas)
loss = self.loss_by_feat(preds_dicts, batch_gt_instances_3d)
return loss
def loss_by_feat(self, preds_dicts: Tuple[List[dict]],
batch_gt_instances_3d: List[InstanceData], *args,
**kwargs):
(
labels,
label_weights,
......@@ -743,7 +774,7 @@ class TransFusionHead(nn.Module):
num_pos,
matched_ious,
heatmap,
) = self.get_targets(gt_bboxes_3d, gt_labels_3d, preds_dicts[0])
) = self.get_targets(batch_gt_instances_3d, preds_dicts[0])
if hasattr(self, 'on_the_image_mask'):
label_weights = label_weights * self.on_the_image_mask
bbox_weights = bbox_weights * self.on_the_image_mask[:, :, None]
......@@ -753,8 +784,8 @@ class TransFusionHead(nn.Module):
# compute heatmap loss
loss_heatmap = self.loss_heatmap(
clip_sigmoid(preds_dict['dense_heatmap']),
heatmap,
clip_sigmoid(preds_dict['dense_heatmap']).float(),
heatmap.float(),
avg_factor=max(heatmap.eq(1).float().sum().item(), 1),
)
loss_dict['loss_heatmap'] = loss_heatmap
......@@ -781,7 +812,7 @@ class TransFusionHead(nn.Module):
layer_cls_score = layer_score.permute(0, 2, 1).reshape(
-1, self.num_classes)
layer_loss_cls = self.loss_cls(
layer_cls_score,
layer_cls_score.float(),
layer_labels,
layer_label_weights,
avg_factor=max(num_pos, 1),
......
......@@ -7,6 +7,8 @@ try:
except ImportError:
linear_sum_assignment = None
from mmengine.structures import InstanceData
from mmdet3d.registry import TASK_UTILS
......@@ -273,8 +275,11 @@ class HungarianAssigner3D(BaseAssigner):
num_gts, assigned_gt_inds, None, labels=assigned_labels)
# 2. compute the weighted costs
# see mmdetection/mmdet/core/bbox/match_costs/match_cost.py
cls_cost = self.cls_cost(cls_pred[0].T, gt_labels)
# Hard code here to be compatible with the interface of
# `ClassificationCost` in mmdet.
gt_instances, pred_instances = InstanceData(
labels=gt_labels), InstanceData(scores=cls_pred[0].T)
cls_cost = self.cls_cost(pred_instances, gt_instances)
reg_cost = self.reg_cost(bboxes, gt_bboxes, train_cfg)
iou = self.iou_calculator(bboxes, gt_bboxes)
iou_cost = self.iou_cost(iou)
......
_base_ = [
'./bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py'
]
point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
input_modality = dict(use_lidar=True, use_camera=True)
backend_args = None
model = dict(
type='BEVFusion',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=False),
img_backbone=dict(
type='mmdet.SwinTransformer',
embed_dims=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.2,
patch_norm=True,
out_indices=[1, 2, 3],
with_cp=False,
convert_weights=True,
init_cfg=dict(
type='Pretrained',
checkpoint= # noqa: E251
'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa: E501
)),
img_neck=dict(
type='GeneralizedLSSFPN',
in_channels=[192, 384, 768],
out_channels=256,
start_level=0,
num_outs=3,
norm_cfg=dict(type='BN2d', requires_grad=True),
act_cfg=dict(type='ReLU', inplace=True),
upsample_cfg=dict(mode='bilinear', align_corners=False)),
view_transform=dict(
type='DepthLSSTransform',
in_channels=256,
out_channels=80,
image_size=[256, 704],
feature_size=[32, 88],
xbound=[-54.0, 54.0, 0.3],
ybound=[-54.0, 54.0, 0.3],
zbound=[-10.0, 10.0, 20.0],
dbound=[1.0, 60.0, 0.5],
downsample=2),
fusion_layer=dict(
type='ConvFuser', in_channels=[80, 256], out_channels=256))
train_pipeline = [
dict(
type='BEVLoadMultiViewImageFromFiles',
to_float32=True,
color_type='color',
backend_args=backend_args),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
backend_args=backend_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
load_dim=5,
use_dim=5,
pad_empty_sweeps=True,
remove_close=True,
backend_args=backend_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_attr_label=False),
dict(
type='ImageAug3D',
final_dim=[256, 704],
resize_lim=[0.38, 0.55],
bot_pct_lim=[0.0, 0.0],
rot_lim=[-5.4, 5.4],
rand_flip=True,
is_train=True),
dict(
type='BEVFusionGlobalRotScaleTrans',
scale_ratio_range=[0.9, 1.1],
rot_range=[-0.78539816, 0.78539816],
translation_std=0.5),
dict(type='BEVFusionRandomFlip3D'),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='ObjectNameFilter',
classes=[
'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]),
# 'GridMask' is kept in the pipeline but effectively disabled (prob=0.0)
dict(
type='GridMask',
use_h=True,
use_w=True,
max_epoch=6,
rotate=1,
offset=False,
ratio=0.5,
mode=1,
prob=0.0,
fixed_prob=True),
dict(type='PointShuffle'),
dict(
type='Pack3DDetInputs',
keys=[
'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes',
'gt_labels'
],
meta_keys=[
'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar',
'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx',
'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation',
'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix',
'lidar_aug_matrix'
])
]
test_pipeline = [
dict(
type='BEVLoadMultiViewImageFromFiles',
to_float32=True,
color_type='color',
backend_args=backend_args),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
backend_args=backend_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
load_dim=5,
use_dim=5,
pad_empty_sweeps=True,
remove_close=True,
backend_args=backend_args),
dict(
type='ImageAug3D',
final_dim=[256, 704],
resize_lim=[0.48, 0.48],
bot_pct_lim=[0.0, 0.0],
rot_lim=[0.0, 0.0],
rand_flip=False,
is_train=False),
dict(
type='PointsRangeFilter',
point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]),
dict(
type='Pack3DDetInputs',
keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'],
meta_keys=[
'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar',
'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx',
'lidar_path', 'img_path'
])
]
train_dataloader = dict(
dataset=dict(
dataset=dict(pipeline=train_pipeline, modality=input_modality)))
val_dataloader = dict(
dataset=dict(pipeline=test_pipeline, modality=input_modality))
test_dataloader = val_dataloader
param_scheduler = [
dict(
type='LinearLR',
start_factor=0.33333333,
by_epoch=False,
begin=0,
end=500),
dict(
type='CosineAnnealingLR',
begin=0,
T_max=6,
end=6,
by_epoch=True,
eta_min_ratio=1e-4,
convert_to_iter_based=True),
# momentum scheduler
# During the first 2.4 epochs, momentum is annealed from its initial value
# down to 0.85 / 0.95; during the next 3.6 epochs it is annealed back up to 1
dict(
type='CosineAnnealingMomentum',
eta_min=0.85 / 0.95,
begin=0,
end=2.4,
by_epoch=True,
convert_to_iter_based=True),
dict(
type='CosineAnnealingMomentum',
eta_min=1,
begin=2.4,
end=6,
by_epoch=True,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=1)
val_cfg = dict()
test_cfg = dict()
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01),
clip_grad=dict(max_norm=35, norm_type=2))
# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
# or not by default.
# - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=32)
default_hooks = dict(
logger=dict(type='LoggerHook', interval=50),
checkpoint=dict(type='CheckpointHook', interval=1))
del _base_.custom_hooks
......@@ -26,16 +26,25 @@ data_prefix = dict(
CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT',
CAM_BACK_LEFT='samples/CAM_BACK_LEFT',
sweeps='sweeps/LIDAR_TOP')
input_modality = dict(use_lidar=True, use_camera=True)
input_modality = dict(use_lidar=True, use_camera=False)
# backend_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/':
# 's3://openmmlab/datasets/detection3d/nuscenes/',
# 'data/nuscenes/':
# 's3://openmmlab/datasets/detection3d/nuscenes/',
# './data/nuscenes_mini/':
# 's3://openmmlab/datasets/detection3d/nuscenes/',
# 'data/nuscenes_mini/':
# 's3://openmmlab/datasets/detection3d/nuscenes/'
# }))
backend_args = None
model = dict(
type='BEVFusion',
data_preprocessor=dict(
type='Det3DDataPreprocessor',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
bgr_to_rgb=False,
pad_size_divisor=32,
voxelize_cfg=dict(
max_num_points=10,
......@@ -43,74 +52,31 @@ model = dict(
voxel_size=[0.075, 0.075, 0.2],
max_voxels=[120000, 160000],
voxelize_reduce=True)),
img_backbone=dict(
type='mmdet.SwinTransformer',
embed_dims=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.0,
attn_drop_rate=0.0,
drop_path_rate=0.2,
patch_norm=True,
out_indices=[1, 2, 3],
with_cp=False,
convert_weights=True,
init_cfg=dict(
type='Pretrained',
checkpoint= # noqa: E251
'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth' # noqa: E501
)),
img_neck=dict(
type='GeneralizedLSSFPN',
in_channels=[192, 384, 768],
out_channels=256,
start_level=0,
num_outs=3,
norm_cfg=dict(type='BN2d', requires_grad=True),
act_cfg=dict(type='ReLU', inplace=True),
upsample_cfg=dict(mode='bilinear', align_corners=False)),
vtransform=dict(
type='DepthLSSTransform',
in_channels=256,
out_channels=80,
image_size=[256, 704],
feature_size=[32, 88],
xbound=[-54.0, 54.0, 0.3],
ybound=[-54.0, 54.0, 0.3],
zbound=[-10.0, 10.0, 20.0],
dbound=[1.0, 60.0, 0.5],
downsample=2),
pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
pts_middle_encoder=dict(
type='BEVFusionSparseEncoder',
in_channels=5,
sparse_shape=[1440, 1440, 41],
order=('conv', 'norm', 'act'),
norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01),
norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01),
encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
128)),
encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)),
block_type='basicblock'),
fusion_layer=dict(
type='ConvFuser', in_channels=[80, 256], out_channels=256),
pts_backbone=dict(
type='SECOND',
in_channels=256,
out_channels=[128, 256],
layer_nums=[5, 5],
layer_strides=[1, 2],
norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01),
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
conv_cfg=dict(type='Conv2d', bias=False)),
pts_neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
out_channels=[256, 256],
upsample_strides=[1, 2],
norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01),
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
use_conv_for_no_stride=True),
bbox_head=dict(
......@@ -191,9 +157,7 @@ db_sampler = dict(
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
classes=class_names,
sample_groups=dict(
filter_by_min_points=dict(
car=5,
truck=5,
bus=5,
......@@ -203,61 +167,53 @@ db_sampler = dict(
barrier=5,
motorcycle=5,
bicycle=5,
pedestrian=5),
pedestrian=5)),
classes=class_names,
sample_groups=dict(
car=2,
truck=3,
construction_vehicle=7,
bus=4,
trailer=6,
barrier=2,
motorcycle=6,
bicycle=6,
pedestrian=2,
traffic_cone=2),
points_loader=dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
reduce_beams=32,
backend_args=backend_args),
backend_args=backend_args)
backend_args=backend_args))
train_pipeline = [
dict(
type='BEVLoadMultiViewImageFromFiles',
to_float32=True,
color_type='color',
backend_args=backend_args),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
reduce_beams=32,
load_augmented=None,
backend_args=backend_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
load_dim=5,
use_dim=5,
reduce_beams=32,
pad_empty_sweeps=True,
remove_close=True,
load_augmented=None,
backend_args=backend_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_attr_label=False),
# dict(type='ObjectSampling', db_sampler=db_sampler),
dict(
type='ImageAug3D',
final_dim=[256, 704],
resize_lim=[0.38, 0.55],
bot_pct_lim=[0.0, 0.0],
rot_lim=[-5.4, 5.4],
rand_flip=True,
is_train=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='GlobalRotScaleTrans',
resize_lim=[0.9, 1.1],
rot_lim=[-0.78539816, 0.78539816],
trans_lim=0.5,
is_train=True),
dict(type='RandomFlip3D'),
scale_ratio_range=[0.9, 1.1],
rot_range=[-0.78539816, 0.78539816],
translation_std=0.5),
dict(type='BEVFusionRandomFlip3D'),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(
......@@ -266,32 +222,23 @@ train_pipeline = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]),
dict(
type='GridMask',
use_h=True,
use_w=True,
max_epoch=6,
rotate=1,
offset=False,
ratio=0.5,
mode=1,
prob=0.0,
fixed_prob=True),
dict(type='PointShuffle'),
dict(
type='Pack3DDetInputs',
keys=[
'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes',
'gt_labels'
],
meta_keys=[
'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar',
'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx',
'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation',
'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix',
'lidar_aug_matrix'
])
]
test_pipeline = [
dict(
type='BEVLoadMultiViewImageFromFiles',
to_float32=True,
color_type='color',
backend_args=backend_args),
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
......@@ -306,14 +253,6 @@ test_pipeline = [
pad_empty_sweeps=True,
remove_close=True,
backend_args=backend_args),
dict(
type='ImageAug3D',
final_dim=[256, 704],
resize_lim=[0.48, 0.48],
bot_pct_lim=[0.0, 0.0],
rot_lim=[0.0, 0.0],
rand_flip=False,
is_train=False),
dict(
type='PointsRangeFilter',
point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]),
......@@ -332,6 +271,8 @@ train_dataloader = dict(
num_workers=4,
persistent_workers=True,
sampler=dict(type='DefaultSampler', shuffle=True),
dataset=dict(
type='CBGSDataset',
dataset=dict(
type=dataset_type,
data_root=data_root,
......@@ -341,14 +282,14 @@ train_dataloader = dict(
modality=input_modality,
test_mode=False,
data_prefix=data_prefix,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR',
backend_args=backend_args))
box_type_3d='LiDAR')))
val_dataloader = dict(
batch_size=1,
num_workers=0,
# persistent_workers=True,
num_workers=4,
persistent_workers=True,
drop_last=False,
sampler=dict(type='DefaultSampler', shuffle=False),
dataset=dict(
......@@ -376,55 +317,68 @@ vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer')
# learning rate
lr = 0.0001
param_scheduler = [
# learning rate scheduler
# During the first 8 epochs, learning rate increases from lr to lr * 10
# during the next 12 epochs, learning rate decreases from lr * 10 to
# lr * 1e-4
dict(
type='LinearLR',
start_factor=0.33333333,
by_epoch=False,
type='CosineAnnealingLR',
T_max=8,
eta_min=lr * 10,
begin=0,
end=500),
end=8,
by_epoch=True,
convert_to_iter_based=True),
dict(
type='CosineAnnealingLR',
begin=0,
T_max=6,
end=6,
T_max=12,
eta_min=lr * 1e-4,
begin=8,
end=20,
by_epoch=True,
eta_min_ratio=1e-3),
convert_to_iter_based=True),
# momentum scheduler
# During the first 8 epochs, momentum increases from 1 to 0.85 / 0.95
# During the first 8 epochs, momentum increases from 0 to 0.85 / 0.95
# during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1
dict(
type='CosineAnnealingMomentum',
T_max=8,
eta_min=0.85 / 0.95,
begin=0,
end=2.4,
end=8,
by_epoch=True,
convert_to_iter_based=True),
dict(
type='CosineAnnealingMomentum',
T_max=12,
eta_min=1,
begin=2.4,
end=6,
begin=8,
end=20,
by_epoch=True,
convert_to_iter_based=True)
]
# runtime settings
train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=6)
train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=5)
val_cfg = dict()
test_cfg = dict()
optim_wrapper = dict(
type='OptimWrapper',
optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.01),
optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01),
clip_grad=dict(max_norm=35, norm_type=2))
# Default setting for scaling LR automatically
# - `enable` means enable scaling LR automatically
# or not by default.
# - `base_batch_size` = (4 GPUs) x (4 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=16)
# - `base_batch_size` = (8 GPUs) x (4 samples per GPU).
auto_scale_lr = dict(enable=False, base_batch_size=32)
log_processor = dict(window_size=50)
default_hooks = dict(
logger=dict(type='LoggerHook', interval=50),
checkpoint=dict(type='CheckpointHook', interval=5))
custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)]
......@@ -2,6 +2,8 @@
from unittest import TestCase
from unittest.mock import Mock
from mmengine.dataset import BaseDataset
from mmdet3d.datasets.transforms import ObjectSample
from mmdet3d.engine.hooks import DisableObjectSampleHook
......@@ -10,7 +12,7 @@ class TestDisableObjectSampleHook(TestCase):
runner = Mock()
runner.train_dataloader = Mock()
runner.train_dataloader.dataset = Mock()
runner.train_dataloader.dataset = Mock(spec=BaseDataset)
runner.train_dataloader.dataset.pipeline = Mock()
runner.train_dataloader._DataLoader__initialized = True
runner.train_dataloader.dataset.pipeline.transforms = [
......