Commit a9dc86e9 authored by lishj6

init_0905

parent 18eda5c1
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <THC/THCAtomics.cuh>
#include <iostream>
#include <stdlib.h>
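// Bilinear interpolation of one feature channel at fractional pixel
// coordinates (h_im, w_im): the four neighbouring samples are gathered
// (out-of-range corners contribute zero) and blended with bilinear weights.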
__device__ float bilinear_sampling(
const float *&bottom_data, const int &height, const int &width,
const int &num_embeds, const float &h_im, const float &w_im,
const int &base_ptr
) {
const int h_low = floorf(h_im);
const int w_low = floorf(w_im);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const float lh = h_im - h_low;
const float lw = w_im - w_low;
const float hh = 1 - lh, hw = 1 - lw;
const int w_stride = num_embeds;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) {
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
}
float v2 = 0;
if (h_low >= 0 && w_high <= width - 1) {
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
}
float v3 = 0;
if (h_high <= height - 1 && w_low >= 0) {
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
}
float v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) {
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
}
const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
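// Backward of the bilinear sampling above: for each in-range corner the kernel
// scatters w_i * grad_output * weight into grad_mc_ms_feat with atomic adds,
// accumulates d(val)/d(h_im) and d(val)/d(w_im) in grad_h_weight / grad_w_weight,
// and finally writes the gradients w.r.t. the aggregation weight and the
// normalized sampling location (scaled by height / width, since
// h_im = loc_h * height - 0.5 and w_im = loc_w * width - 0.5).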
__device__ void bilinear_sampling_grad(
const float *&bottom_data, const float &weight,
const int &height, const int &width,
const int &num_embeds, const float &h_im, const float &w_im,
const int &base_ptr,
const float &grad_output,
float *&grad_mc_ms_feat, float *grad_sampling_location, float *grad_weights) {
const int h_low = floorf(h_im);
const int w_low = floorf(w_im);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const float lh = h_im - h_low;
const float lw = w_im - w_low;
const float hh = 1 - lh, hw = 1 - lw;
const int w_stride = num_embeds;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const float top_grad_mc_ms_feat = grad_output * weight;
float grad_h_weight = 0, grad_w_weight = 0;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) {
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
// atomicAdd(grad_mc_ms_feat + ptr1, w1 * top_grad_mc_ms_feat);
#ifdef __gfx936__
__builtin_amdgcn_global_atomic_fadd_f32(grad_mc_ms_feat + ptr1, w1 * top_grad_mc_ms_feat);
#else
atomicAdd(grad_mc_ms_feat + ptr1, w1 * top_grad_mc_ms_feat);
#endif
}
float v2 = 0;
if (h_low >= 0 && w_high <= width - 1) {
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
// atomicAdd(grad_mc_ms_feat + ptr2, w2 * top_grad_mc_ms_feat);
#ifdef __gfx936__
__builtin_amdgcn_global_atomic_fadd_f32(grad_mc_ms_feat + ptr2, w2 * top_grad_mc_ms_feat);
#else
atomicAdd(grad_mc_ms_feat + ptr2, w2 * top_grad_mc_ms_feat);
#endif
}
float v3 = 0;
if (h_high <= height - 1 && w_low >= 0) {
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
// atomicAdd(grad_mc_ms_feat + ptr3, w3 * top_grad_mc_ms_feat);
#ifdef __gfx936__
__builtin_amdgcn_global_atomic_fadd_f32(grad_mc_ms_feat + ptr3, w3 * top_grad_mc_ms_feat);
#else
atomicAdd(grad_mc_ms_feat + ptr3, w3 * top_grad_mc_ms_feat);
#endif
}
float v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) {
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
// atomicAdd(grad_mc_ms_feat + ptr4, w4 * top_grad_mc_ms_feat);
#ifdef __gfx936__
__builtin_amdgcn_global_atomic_fadd_f32(grad_mc_ms_feat + ptr4, w4 * top_grad_mc_ms_feat);
#else
atomicAdd(grad_mc_ms_feat + ptr4, w4 * top_grad_mc_ms_feat);
#endif
}
const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
// atomicAdd(grad_weights, grad_output * val);
// atomicAdd(grad_sampling_location, width * grad_w_weight * top_grad_mc_ms_feat);
// atomicAdd(grad_sampling_location + 1, height * grad_h_weight * top_grad_mc_ms_feat);
#ifdef __gfx936__
__builtin_amdgcn_global_atomic_fadd_f32(grad_weights, grad_output * val);
__syncthreads();
__builtin_amdgcn_global_atomic_fadd_f32(grad_sampling_location, width * grad_w_weight * top_grad_mc_ms_feat);
__builtin_amdgcn_global_atomic_fadd_f32(grad_sampling_location + 1, height * grad_h_weight * top_grad_mc_ms_feat);
#else
atomicAdd(grad_weights, grad_output * val);
atomicAdd(grad_sampling_location, width * grad_w_weight * top_grad_mc_ms_feat);
atomicAdd(grad_sampling_location + 1, height * grad_h_weight * top_grad_mc_ms_feat);
#endif
}
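// One thread per output element: the flat index decomposes (fastest to slowest)
// into channel, scale, camera, point, anchor and batch. Each thread bilinearly
// samples one channel of one (camera, scale) feature map at one key point and
// atomically accumulates the weighted value into output[batch, anchor, channel].
// A single weight is shared by num_embeds / num_groups consecutive channels.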
__global__ void deformable_aggregation_kernel(
const int num_kernels,
float* output,
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_anchors,
int num_pts,
int num_groups
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= num_kernels) return;
const float weight = *(weights + idx / (num_embeds / num_groups));
const int channel_index = idx % num_embeds;
idx /= num_embeds;
const int scale_index = idx % num_scale;
idx /= num_scale;
const int cam_index = idx % num_cams;
idx /= num_cams;
const int pts_index = idx % num_pts;
idx /= num_pts;
int anchor_index = idx % num_anchors;
idx /= num_anchors;
const int batch_index = idx % batch_size;
idx /= batch_size;
anchor_index = batch_index * num_anchors + anchor_index;
const int loc_offset = ((anchor_index * num_pts + pts_index) * num_cams + cam_index) << 1;
const float loc_w = sample_location[loc_offset];
if (loc_w <= 0 || loc_w >= 1) return;
const float loc_h = sample_location[loc_offset + 1];
if (loc_h <= 0 || loc_h >= 1) return;
int cam_scale_index = cam_index * num_scale + scale_index;
const int value_offset = (batch_index * num_feat + scale_start_index[cam_scale_index]) * num_embeds + channel_index;
cam_scale_index = cam_scale_index << 1;
const int h = spatial_shape[cam_scale_index];
const int w = spatial_shape[cam_scale_index + 1];
const float h_im = loc_h * h - 0.5;
const float w_im = loc_w * w - 0.5;
// atomicAdd(
// output + anchor_index * num_embeds + channel_index,
// bilinear_sampling(mc_ms_feat, h, w, num_embeds, h_im, w_im, value_offset) * weight
// );
#ifdef __gfx936__
__builtin_amdgcn_global_atomic_fadd_f32(output + anchor_index * num_embeds + channel_index, bilinear_sampling(mc_ms_feat, h, w, num_embeds, h_im, w_im, value_offset) * weight);
#else
atomicAdd(output + anchor_index * num_embeds + channel_index, bilinear_sampling(mc_ms_feat, h, w, num_embeds, h_im, w_im, value_offset) * weight);
#endif
}
__global__ void deformable_aggregation_grad_kernel(
const int num_kernels,
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
const float* grad_output,
float* grad_mc_ms_feat,
float* grad_sampling_location,
float* grad_weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_anchors,
int num_pts,
int num_groups
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= num_kernels) return;
const int weights_ptr = idx / (num_embeds / num_groups);
const int channel_index = idx % num_embeds;
idx /= num_embeds;
const int scale_index = idx % num_scale;
idx /= num_scale;
const int cam_index = idx % num_cams;
idx /= num_cams;
const int pts_index = idx % num_pts;
idx /= num_pts;
int anchor_index = idx % num_anchors;
idx /= num_anchors;
const int batch_index = idx % batch_size;
idx /= batch_size;
anchor_index = batch_index * num_anchors + anchor_index;
const int loc_offset = ((anchor_index * num_pts + pts_index) * num_cams + cam_index) << 1;
const float loc_w = sample_location[loc_offset];
if (loc_w <= 0 || loc_w >= 1) return;
const float loc_h = sample_location[loc_offset + 1];
if (loc_h <= 0 || loc_h >= 1) return;
const float grad = grad_output[anchor_index*num_embeds + channel_index];
int cam_scale_index = cam_index * num_scale + scale_index;
const int value_offset = (batch_index * num_feat + scale_start_index[cam_scale_index]) * num_embeds + channel_index;
cam_scale_index = cam_scale_index << 1;
const int h = spatial_shape[cam_scale_index];
const int w = spatial_shape[cam_scale_index + 1];
const float h_im = loc_h * h - 0.5;
const float w_im = loc_w * w - 0.5;
/* atomicAdd( */
/* output + anchor_index * num_embeds + channel_index, */
/* bilinear_sampling(mc_ms_feat, h, w, num_embeds, h_im, w_im, value_offset) * weight */
/* ); */
const float weight = weights[weights_ptr];
float *grad_weights_ptr = grad_weights + weights_ptr;
float *grad_location_ptr = grad_sampling_location + loc_offset;
bilinear_sampling_grad(
mc_ms_feat, weight, h, w, num_embeds, h_im, w_im,
value_offset,
grad,
grad_mc_ms_feat, grad_location_ptr, grad_weights_ptr
);
}
void deformable_aggregation(
float* output,
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_anchors,
int num_pts,
int num_groups
) {
const int num_kernels = batch_size * num_pts * num_embeds * num_anchors * num_cams * num_scale;
deformable_aggregation_kernel
<<<(int)ceil(((double)num_kernels/128)), 128>>>(
num_kernels, output,
mc_ms_feat, spatial_shape, scale_start_index, sample_location, weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
);
}
void deformable_aggregation_grad(
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
const float* grad_output,
float* grad_mc_ms_feat,
float* grad_sampling_location,
float* grad_weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_anchors,
int num_pts,
int num_groups
) {
const int num_kernels = batch_size * num_pts * num_embeds * num_anchors * num_cams * num_scale;
deformable_aggregation_grad_kernel
<<<(int)ceil(((double)num_kernels/128)), 128>>>(
num_kernels,
mc_ms_feat, spatial_shape, scale_start_index, sample_location, weights,
grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
);
}
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include <torch/extension.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
void deformable_aggregation(
float* output,
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_anchors,
int num_pts,
int num_groups
);
/* feat: bs, num_feat, c */
/* _spatial_shape: cam, scale, 2 */
/* _scale_start_index: cam, scale */
/* _sampling_location: bs, anchor, pts, cam, 2 */
/* _weights: bs, anchor, pts, cam, scale, group */
/* output: bs, anchor, c */
/* kernel: bs, anchor, pts, c */
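/* Note (inferred from the kernels above): feat stores, for each batch element,
   all camera/scale feature maps flattened over their spatial dims and
   concatenated along the num_feat axis, so num_feat = sum over (cam, scale)
   of h * w, and _scale_start_index[cam, scale] is the row offset of each map. */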
at::Tensor deformable_aggregation_forward(
const at::Tensor &_mc_ms_feat,
const at::Tensor &_spatial_shape,
const at::Tensor &_scale_start_index,
const at::Tensor &_sampling_location,
const at::Tensor &_weights
) {
at::DeviceGuard guard(_mc_ms_feat.device());
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(_mc_ms_feat));
int batch_size = _mc_ms_feat.size(0);
int num_feat = _mc_ms_feat.size(1);
int num_embeds = _mc_ms_feat.size(2);
int num_cams = _spatial_shape.size(0);
int num_scale = _spatial_shape.size(1);
int num_anchors = _sampling_location.size(1);
int num_pts = _sampling_location.size(2);
int num_groups = _weights.size(5);
const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
const int* spatial_shape = _spatial_shape.data_ptr<int>();
const int* scale_start_index = _scale_start_index.data_ptr<int>();
const float* sampling_location = _sampling_location.data_ptr<float>();
const float* weights = _weights.data_ptr<float>();
auto output = at::zeros({batch_size, num_anchors, num_embeds}, _mc_ms_feat.options());
deformable_aggregation(
output.data_ptr<float>(),
mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
);
return output;
}
void deformable_aggregation_grad(
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
const float* grad_output,
float* grad_mc_ms_feat,
float* grad_sampling_location,
float* grad_weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_anchors,
int num_pts,
int num_groups
);
void deformable_aggregation_backward(
const at::Tensor &_mc_ms_feat,
const at::Tensor &_spatial_shape,
const at::Tensor &_scale_start_index,
const at::Tensor &_sampling_location,
const at::Tensor &_weights,
const at::Tensor &_grad_output,
at::Tensor &_grad_mc_ms_feat,
at::Tensor &_grad_sampling_location,
at::Tensor &_grad_weights
) {
at::DeviceGuard guard(_mc_ms_feat.device());
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(_mc_ms_feat));
int batch_size = _mc_ms_feat.size(0);
int num_feat = _mc_ms_feat.size(1);
int num_embeds = _mc_ms_feat.size(2);
int num_cams = _spatial_shape.size(0);
int num_scale = _spatial_shape.size(1);
int num_anchors = _sampling_location.size(1);
int num_pts = _sampling_location.size(2);
int num_groups = _weights.size(5);
const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
const int* spatial_shape = _spatial_shape.data_ptr<int>();
const int* scale_start_index = _scale_start_index.data_ptr<int>();
const float* sampling_location = _sampling_location.data_ptr<float>();
const float* weights = _weights.data_ptr<float>();
const float* grad_output = _grad_output.data_ptr<float>();
float* grad_mc_ms_feat = _grad_mc_ms_feat.data_ptr<float>();
float* grad_sampling_location = _grad_sampling_location.data_ptr<float>();
float* grad_weights = _grad_weights.data_ptr<float>();
deformable_aggregation_grad(
mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(
"deformable_aggregation_forward",
&deformable_aggregation_forward,
"deformable_aggregation_forward"
);
m.def(
"deformable_aggregation_backward",
&deformable_aggregation_backward,
"deformable_aggregation_backward"
);
}
<div align="center">
# Sparse4D: Sparse-based End-to-end Multi-view Temporal Perception
</div>
> [Github](https://github.com/HorizonRobotics/Sparse4D) \
> [Sparse4D v1: Multi-view 3D Object Detection with Sparse Spatial-Temporal Fusion](https://arxiv.org/abs/2211.10581) \
> [Sparse4D v2: Recurrent Temporal Fusion with Sparse Model](https://arxiv.org/abs/2305.14018) \
> [Sparse4D v3: Advancing End-to-End 3D Detection and Tracking](https://arxiv.org/abs/2311.11722) \
> [SparseDrive: End-to-End Autonomous Driving via Sparse Scene Representation](https://arxiv.org/abs/2405.19620) \
> [Chinese Interpretation of the Papers](https://zhuanlan.zhihu.com/p/637096473)
**【2024/06】Our follow-up project, SparseDrive (an end-to-end planning model based on the sparse framework), has been released!!! [arxiv](https://arxiv.org/abs/2405.19620) & [github](https://github.com/swc-17/SparseDrive).**
## Overall Architecture
<center>
<img style="border-radius: 0.3125em;
box-shadow: 0 2px 4px 0 rgba(34,36,38,.12),0 2px 10px 0 rgba(34,36,38,.08);"
src="resources/sparse4d_architecture.jpg" width="1000">
<br>
<div style="color:orange; border-bottom: 1px solid #d9d9d9;
display: inline-block;
color: #999;
padding: 2px;">Overall Framework of Sparse4D, which conforms to an encoder-decoder structure. The inputs mainly consists of three components: multi-view images, newly initialized instances, propagated instances from previous frame. The output is the refined instances (3D anchor boxes and corresponding features), serve as the perception results for the current frame. Additionally, a subset of these refined instances is selected and propagated to the next frame.</div>
</center>
<center>
<img style="border-radius: 0.3125em;
box-shadow: 0 2px 4px 0 rgba(34,36,38,.12),0 2px 10px 0 rgba(34,36,38,.08);"
src="resources/efficient_deformable_aggregation.jpg" width="1000">
<br>
<div style="color:orange; border-bottom: 1px solid #d9d9d9;
display: inline-block;
color: #999;
padding: 2px;"> Illustration of our Efficient Deformable Aggregation Module. (a) The basic pipeline: we first generate multiple 3D key points inside 3D anchor, then sampling multi-scale/view image feature for each keypoint, and fuse these feature with predicted weight. (b) The parallel implementation: to further improve speed and reduce memory cost, we achieve a parallel implementation, where feature sampling and multi-view/scale weighted sum are combined as a CUDA operation. Our CUDA implementation supports handling different feature resolutions from different views. </div>
</center>
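For readers who prefer code to diagrams, the sketch below is a minimal pure-PyTorch reference of the computation performed by the fused CUDA operator. It is an illustration only: it assumes the per-(camera, scale) feature maps are given as a camera-major list of `[bs, c, h, w]` tensors, that sampling locations are normalized `(w, h)` coordinates in `[0, 1]`, and the function name and argument layout are made up for this example. `align_corners=False` matches the kernel's `loc * size - 0.5` pixel-coordinate convention, while the zero-padding boundary handling of `grid_sample` differs slightly from the kernel's explicit out-of-range check.

```python
import torch
import torch.nn.functional as F

def deformable_aggregation_reference(feat_maps, sample_location, weights):
    # feat_maps:        camera-major list of [bs, c, h, w] tensors, one per (camera, scale)
    # sample_location:  [bs, anchor, pts, cam, 2], normalized (w, h) in [0, 1]
    # weights:          [bs, anchor, pts, cam, scale, group]
    # returns:          [bs, anchor, c]
    bs, num_anchor, num_pts, num_cams, _ = sample_location.shape
    num_scale, num_groups = weights.shape[4], weights.shape[5]
    c = feat_maps[0].shape[1]
    output = feat_maps[0].new_zeros(bs, num_anchor, c)
    for cam in range(num_cams):
        # grid_sample expects (x, y) coordinates in [-1, 1]
        grid = sample_location[:, :, :, cam] * 2 - 1                 # [bs, anchor, pts, 2]
        for scale in range(num_scale):
            fm = feat_maps[cam * num_scale + scale]                  # [bs, c, h, w]
            sampled = F.grid_sample(fm, grid, align_corners=False)   # [bs, c, anchor, pts]
            w = weights[:, :, :, cam, scale]                         # [bs, anchor, pts, group]
            w = w.repeat_interleave(c // num_groups, dim=-1)         # broadcast groups to channels
            output += (sampled.permute(0, 2, 3, 1) * w).sum(dim=2)   # sum over key points
    return output
```

The fused kernel carries out the same sampling and group-wise weighted sum in a single pass without materializing the per-point sampled features, which is where the speed and memory savings come from.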
## nuScenes Benchmark
### Results on Validation Split
These experiments were conducted using 8 RTX 3090 GPUs with 24 GB memory.
|model | backbone |pretrain| img size | Epoch | Training | FPS | NDS | mAP | AMOTA |AMOTP |IDS| config | ckpt | log |
| :----: | :---: | :---: | :---: | :---: | :---:| :---:|:---:|:---: | :---: | :----: | :----: | :---: | :----: | :----: |
|Sparse4D-T4 |Res101|[FCOS3D](https://github.com/linxuewu/Sparse4D/releases/download/v0.0/fcos3d.pth)|640x1600|24|2Day5H|2.9|0.5438|0.4409|-|-|-|[cfg](https://github.com/linxuewu/Sparse4D/blob/v2.0/projects/configs/sparse4d_r101_H4.py)|[ckpt](https://github.com/linxuewu/Sparse4D/releases/download/v0.0/sparse4dv1_r101_H4_release.pth)|[log](https://github.com/linxuewu/Sparse4D/releases/download/v0.0/sparse4d.log)|
|Sparse4Dv2|Res50|[ImageNet]()|256x704| 100 |15H | 20.3 |0.5384|0.4392|-|-|-|[cfg](https://github.com/linxuewu/Sparse4D/blob/v2.0/projects/configs/sparse4dv2_r50_HInf_256x704.py)|[ckpt](https://github.com/linxuewu/Sparse4D/releases/download/v0.0/sparse4dv2_r50_HInf_256x704.pth)|[log](https://github.com/linxuewu/Sparse4D/releases/download/v0.0/sparse4dv2_r50_HInf_256x704.log.json)|
|Sparse4Dv2|Res101|[nuImage](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r101_fpn_1x_nuim/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804-45215b1e.pth)|512x1408| 100 |2Day | 8.4 |0.5939|0.5051|-|-|-|-|-|-|
|Sparse4Dv3|Res50|[ImageNet]()|256x704| 100 |22H | 19.8 |0.5637|0.4646|0.477|1.167|456|[cfg](https://github.com/HorizonRobotics/Sparse4D/blob/main/projects/configs/sparse4dv3_temporal_r50_1x8_bs6_256x704.py)|[ckpt](https://github.com/HorizonRobotics/Sparse4D/releases/download/v3.0/sparse4dv3_r50.pth)|[log](https://github.com/HorizonRobotics/Sparse4D/releases/download/v3.0/sparse4dv3_r50.log)
|Sparse4Dv3|Res101|[nuImage](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r101_fpn_1x_nuim/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804-45215b1e.pth)|512x1408| 100 |2Day | 8.2 |0.623|0.537|0.567|1.027|557|-|-|-|
### Results on Test Split
|model| backbone | img size | NDS | mAP |mATE| mASE | mAOE |mAVE| mAAE | AMOTA |AMOTP |IDS|
| :---: | :---: | :---: | :---: | :---:|:---:|:---: | :---: | :----: | :----: | :----: | :----: | :----: |
|Sparse4D-T4|[VoV-99](https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/det/eva02_L_coco_det_sys_o365.pth)|640x1600|0.595|0.511|0.533|0.263|0.369|0.317|0.124|-|-|-|
|Sparse4Dv2|[VoV-99](https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/det/eva02_L_coco_det_sys_o365.pth)|640x1600|0.638|0.556|0.462|0.238|0.328|0.264|0.115|-|-|-|
|Sparse4Dv3|[VoV-99](https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/det/eva02_L_coco_det_sys_o365.pth)|640x1600|0.656|0.570|0.412|0.236|0.312|0.210|0.117|0.574|0.970|669|
|Sparse4Dv3-offline|[EVA02-large](https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/det/eva02_L_coco_det_sys_o365.pth)|640x1600|0.719|0.668|0.346|0.234|0.279|0.142|0.145|0.677|0.761|514|
PS: On the nuScenes leaderboard, Sparse4Dv3 is marked with external data=True because the EVA02-large pretraining utilized ImageNet, Objects365, and COCO, along with supervision from CLIP. Therefore, we consider a model pre-trained with EVA02 as incorporating external data. **However, we did not use external 3D detection data for training.** This clarification is provided to facilitate fair comparisons.
## Quick Start
[Quick Start](docs/quick_start.md)
## Citation
```
@misc{2311.11722,
Author = {Xuewu Lin and Zixiang Pei and Tianwei Lin and Lichao Huang and Zhizhong Su},
Title = {Sparse4D v3: Advancing End-to-End 3D Detection and Tracking},
Year = {2023},
Eprint = {arXiv:2311.11722},
}
@misc{2305.14018,
Author = {Xuewu Lin and Tianwei Lin and Zixiang Pei and Lichao Huang and Zhizhong Su},
Title = {Sparse4D v2: Recurrent Temporal Fusion with Sparse Model},
Year = {2023},
Eprint = {arXiv:2305.14018},
}
@misc{2211.10581,
Author = {Xuewu Lin and Tianwei Lin and Zixiang Pei and Lichao Huang and Zhizhong Su},
Title = {Sparse4D: Multi-view 3D Object Detection with Sparse Spatial-Temporal Fusion},
Year = {2022},
Eprint = {arXiv:2211.10581},
}
```
## Acknowledgement
- [BEVFormer](https://github.com/fundamentalvision/BEVFormer)
- [DETR3D](https://github.com/WangYueFt/detr3d)
- [mmdet3d](https://github.com/open-mmlab/mmdetection3d)
- [SOLOFusion](https://github.com/Divadi/SOLOFusion/tree/main/configs/solofusion)
- [StreamPETR](https://github.com/exiawsh/StreamPETR)
torch==1.13.0+cu116
torchvision==0.14.0+cu116
torchaudio==0.13.0+cu116
numpy==1.23.5
mmcv_full==1.7.1
mmdet==2.28.2
urllib3==1.26.16
pyquaternion==0.9.9
nuscenes-devkit==1.1.10
yapf==0.33.0
tensorboard==2.14.0
motmetrics==1.1.3
pandas==1.1.5
import numpy as np
from sklearn.cluster import KMeans
import mmcv
from projects.mmdet3d_plugin.core.box3d import *
def get_kmeans_anchor(
ann_file,
num_anchor=900,
detection_range=55,
output_file_name="nuscenes_kmeans900.npy",
verbose=False,
):
data = mmcv.load(ann_file, file_format="pkl")
gt_boxes = np.concatenate([x["gt_boxes"] for x in data["infos"]], axis=0)
distance = np.linalg.norm(gt_boxes[:, :3], axis=-1, ord=2)
mask = distance <= detection_range
gt_boxes = gt_boxes[mask]
clf = KMeans(n_clusters=num_anchor, verbose=verbose)
print("===========Starting kmeans, please wait.===========")
clf.fit(gt_boxes[:, [X, Y, Z]])
anchor = np.zeros((num_anchor, 11))
anchor[:, [X, Y, Z]] = clf.cluster_centers_
anchor[:, [W, L, H]] = np.log(gt_boxes[:, [W, L, H]].mean(axis=0))
anchor[:, COS_YAW] = 1
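# Box sizes are stored in log space; yaw is encoded as (sin_yaw, cos_yaw), so
# cos_yaw = 1 with sin_yaw = 0 corresponds to yaw = 0. Any remaining dims of the
# 11-dim anchor (e.g. velocities, if present in the layout) are left at zero.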
np.save(output_file_name, anchor)
print(f"===========Done! Save results to {output_file_name}.===========")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="anchor kmeans")
parser.add_argument("--ann_file", type=str, required=True)
parser.add_argument("--num_anchor", type=int, default=900)
parser.add_argument("--detection_range", type=float, default=55)
parser.add_argument(
"--output_file_name", type=str, default="_nuscenes_kmeans900.npy"
)
parser.add_argument("--verbose", action="store_true")
args = parser.parse_args()
get_kmeans_anchor(
args.ann_file,
args.num_anchor,
args.detection_range,
args.output_file_name,
args.verbose,
)
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import time
import torch
from mmcv import Config
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint, wrap_fp16_model
import sys
sys.path.append(".")
from projects.mmdet3d_plugin.datasets.builder import build_dataloader
from projects.mmdet3d_plugin.datasets import custom_build_dataset
# from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet.models import build_detector
from tools.fuse_conv_bn import fuse_module
def parse_args():
parser = argparse.ArgumentParser(description="MMDet benchmark a model")
parser.add_argument("config", help="test config file path")
parser.add_argument("--checkpoint", default=None, help="checkpoint file")
parser.add_argument("--samples", default=2000, help="samples to benchmark")
parser.add_argument(
"--log-interval", default=50, help="interval of logging"
)
parser.add_argument(
"--fuse-conv-bn",
action="store_true",
help="Whether to fuse conv and bn, this will slightly increase"
"the inference speed",
)
args = parser.parse_args()
return args
def get_max_memory(model):
device = getattr(model, "output_device", None)
mem = torch.cuda.max_memory_allocated(device=device)
mem_mb = torch.tensor(
[mem / (1024 * 1024)], dtype=torch.int, device=device
)
return mem_mb.item()
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
# set cudnn_benchmark
if cfg.get("cudnn_benchmark", False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
cfg.data.test.test_mode = True
# build the dataloader
# TODO: support multiple images per gpu (only minor changes are needed)
print(cfg.data.test)
dataset = custom_build_dataset(cfg.data.test)
data_loader = build_dataloader(
dataset,
samples_per_gpu=1,
workers_per_gpu=cfg.data.workers_per_gpu,
dist=False,
shuffle=False,
)
# build the model and load checkpoint
cfg.model.train_cfg = None
model = build_detector(cfg.model, test_cfg=cfg.get("test_cfg"))
fp16_cfg = cfg.get("fp16", None)
if fp16_cfg is not None:
wrap_fp16_model(model)
if args.checkpoint is not None:
load_checkpoint(model, args.checkpoint, map_location="cpu")
if args.fuse_conv_bn:
model = fuse_module(model)
model = MMDataParallel(model, device_ids=[0])
model.eval()
# the first several iterations may be very slow so skip them
num_warmup = 5
pure_inf_time = 0
# benchmark with several samples and take the average
max_memory = 0
for i, data in enumerate(data_loader):
# torch.cuda.synchronize()
with torch.no_grad():
start_time = time.perf_counter()
model(return_loss=False, rescale=True, **data)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start_time
max_memory = max(max_memory, get_max_memory(model))
if i >= num_warmup:
pure_inf_time += elapsed
if (i + 1) % args.log_interval == 0:
fps = (i + 1 - num_warmup) / pure_inf_time
print(
f"Done image [{i + 1:<3}/ {args.samples}], "
f"fps: {fps:.1f} img / s, "
f"gpu mem: {max_memory} M"
)
if (i + 1) == args.samples:
pure_inf_time += elapsed
fps = (i + 1 - num_warmup) / pure_inf_time
print(f"Overall fps: {fps:.1f} img / s")
break
if __name__ == "__main__":
main()
#!/usr/bin/env bash
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29610}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4}
#!/usr/bin/env bash
CONFIG=$1
GPUS=$2
PORT=${PORT:-28651}
# export PYTORCH_MIOPEN_SUGGEST_NHWC=1 #.to(memory_format=torch.channels_last)
# export MIOPEN_FIND_MODE=1
export TORCHINDUCTOR_LAYOUT_OPTIMIZATION=1
export PYTORCH_MIOPEN_SUGGEST_NHWC=1
export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
export LD_LIBRARY_PATH=/opt/rocblas-install/lib:$LD_LIBRARY_PATH
export NCCL_ALGO=Ring
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_TOPO_FILE=null
export NCCL_RINGS="N0 0 7 6 5 4 3 2 1 N0|N1 1 2 3 4 5 6 7 0 N1|N2 2 1 0 7 6 5 4 3 N2|N3 3 4 5 6 7 0 1 2 N3|N4 4 3 2 1 0 7 6 5 N4|N5 5 6 7 0 1 2 3 4 N5|N6 6 5 4 3 2 1 0 7 N6|N7 7 0 1 2 3 4 5 6 N7"
export HIP_VISIBLE_DEVICES=1
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
torchrun --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
#!/usr/bin/env bash
CONFIG=$1
GPUS=$2
PORT=${PORT:-28650}
export NCCL_ALGO=Ring
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_TOPO_FILE=null
export NCCL_RINGS="N0 0 7 6 5 4 3 2 1 N0|N1 1 2 3 4 5 6 7 0 N1|N2 2 1 0 7 6 5 4 3 N2|N3 3 4 5 6 7 0 1 2 N3|N4 4 3 2 1 0 7 6 5 N4|N5 5 6 7 0 1 2 3 4 N5|N6 6 5 4 3 2 1 0 7 N6|N7 7 0 1 2 3 4 5 6 N7"
# PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
# torchrun --nproc_per_node=$GPUS --master_port=$PORT \
# $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
torchrun --nnodes 2 --nproc_per_node 8 --node_rank 0 --master_addr=10.16.6.16 --master_port 4519 \
$(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
# Set the master node address (can be omitted for single-machine training)
# export MASTER_ADDR="30.149.248.7"
# export GPU_FLUSH_ON_EXECUTION=1
export PYTORCH_MIOPEN_SUGGEST_NHWC=1 #.to(memory_format=torch.channels_last)
export MIOPEN_FIND_MODE=1
export NCCL_ALGO=Ring
export HSA_FORCE_FINE_GRAIN_PCIE=1
# export LD_LIBRARY_PATH=/opt/rocblas-install/lib:$LD_LIBRARY_PATH
export NCCL_TOPO_FILE=null
# export NCCL_DEBUG=INFO
export NCCL_RINGS="N0 0 7 6 5 4 3 2 1 N0|N1 1 2 3 4 5 6 7 0 N1|N2 2 1 0 7 6 5 4 3 N2|N3 3 4 5 6 7 0 1 2 N3|N4 4 3 2 1 0 7 6 5 N4|N5 5 6 7 0 1 2 3 4 N5|N6 6 5 4 3 2 1 0 7 N6|N7 7 0 1 2 3 4 5 6 N7"
# PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
# torchrun --nnodes 2 --nproc_per_node 8 --node_rank 0 --master_addr=10.16.6.16 --master_port 9528 \
# $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
# --enable-profiler \
#--to_channels_last \
#!/usr/bin/env bash
CONFIG=$1
GPUS=$2
PORT=${PORT:-28650}
export NCCL_ALGO=Ring
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_TOPO_FILE=null
export NCCL_RINGS="N0 0 7 6 5 4 3 2 1 N0|N1 1 2 3 4 5 6 7 0 N1|N2 2 1 0 7 6 5 4 3 N2|N3 3 4 5 6 7 0 1 2 N3|N4 4 3 2 1 0 7 6 5 N4|N5 5 6 7 0 1 2 3 4 N5|N6 6 5 4 3 2 1 0 7 N6|N7 7 0 1 2 3 4 5 6 N7"
# PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
# torchrun --nproc_per_node=$GPUS --master_port=$PORT \
# $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
torchrun --nnodes 2 --nproc_per_node 8 --node_rank 1 --master_addr=10.16.6.16 --master_port 4519 \
$(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import torch
from torch import nn as nn
from mmcv.runner import save_checkpoint
from mmdet.apis import init_detector
def fuse_conv_bn(conv, bn):
"""During inference, the functionary of batch norm layers is turned off but
only the mean and var alone channels are used, which exposes the chance to
fuse it with the preceding conv layers to save computations and simplify
network structures."""
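# BN folding: y = gamma * (conv(x) - mean) / sqrt(var + eps) + beta
# is equivalent to a single conv with, per output channel,
#   w' = w * gamma / sqrt(var + eps)
#   b' = (b - mean) * gamma / sqrt(var + eps) + beta
# which is what the code below computes via `factor`.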
conv_w = conv.weight
conv_b = (
conv.bias
if conv.bias is not None
else torch.zeros_like(bn.running_mean)
)
factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
conv.weight = nn.Parameter(
conv_w * factor.reshape([conv.out_channels, 1, 1, 1])
)
conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias)
return conv
def fuse_module(m):
last_conv = None
last_conv_name = None
for name, child in m.named_children():
if isinstance(child, (nn.BatchNorm2d, nn.SyncBatchNorm)):
if last_conv is None: # only fuse BN that is after Conv
continue
fused_conv = fuse_conv_bn(last_conv, child)
m._modules[last_conv_name] = fused_conv
# To reduce changes, set BN as Identity instead of deleting it.
m._modules[name] = nn.Identity()
last_conv = None
elif isinstance(child, nn.Conv2d):
last_conv = child
last_conv_name = name
else:
fuse_module(child)
return m
def parse_args():
parser = argparse.ArgumentParser(
description="fuse Conv and BN layers in a model"
)
parser.add_argument("config", help="config file path")
parser.add_argument("checkpoint", help="checkpoint file path")
parser.add_argument("out", help="output path of the converted model")
args = parser.parse_args()
return args
def main():
args = parse_args()
# build the model from a config file and a checkpoint file
model = init_detector(args.config, args.checkpoint)
# fuse conv and bn layers of the model
fused_model = fuse_module(model)
save_checkpoint(fused_model, args.out)
if __name__ == "__main__":
main()
# Copyright (c) OpenMMLab. All rights reserved.
import os
from collections import OrderedDict
from os import path as osp
from typing import List, Tuple, Union
import numpy as np
from nuscenes.nuscenes import NuScenes
import mmcv
from pyquaternion import Quaternion
NameMapping = {
"movable_object.barrier": "barrier",
"vehicle.bicycle": "bicycle",
"vehicle.bus.bendy": "bus",
"vehicle.bus.rigid": "bus",
"vehicle.car": "car",
"vehicle.construction": "construction_vehicle",
"vehicle.motorcycle": "motorcycle",
"human.pedestrian.adult": "pedestrian",
"human.pedestrian.child": "pedestrian",
"human.pedestrian.construction_worker": "pedestrian",
"human.pedestrian.police_officer": "pedestrian",
"movable_object.trafficcone": "traffic_cone",
"vehicle.trailer": "trailer",
"vehicle.truck": "truck",
}
def create_nuscenes_infos(
root_path, info_prefix, version="v1.0-trainval", max_sweeps=10
):
"""Create info file of nuscene dataset.
Given the raw data, generate its related info file in pkl format.
Args:
root_path (str): Path of the data root.
info_prefix (str): Prefix of the info file to be generated.
version (str, optional): Version of the data.
Default: 'v1.0-trainval'.
max_sweeps (int, optional): Max number of sweeps.
Default: 10.
"""
from nuscenes.nuscenes import NuScenes
nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
from nuscenes.utils import splits
available_vers = ["v1.0-trainval", "v1.0-test", "v1.0-mini"]
assert version in available_vers
if version == "v1.0-trainval":
train_scenes = splits.train
val_scenes = splits.val
elif version == "v1.0-test":
train_scenes = splits.test
val_scenes = []
elif version == "v1.0-mini":
train_scenes = splits.mini_train
val_scenes = splits.mini_val
else:
raise ValueError("unknown")
# filter existing scenes.
available_scenes = get_available_scenes(nusc)
available_scene_names = [s["name"] for s in available_scenes]
train_scenes = list(
filter(lambda x: x in available_scene_names, train_scenes)
)
val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes))
train_scenes = set(
[
available_scenes[available_scene_names.index(s)]["token"]
for s in train_scenes
]
)
val_scenes = set(
[
available_scenes[available_scene_names.index(s)]["token"]
for s in val_scenes
]
)
test = "test" in version
if test:
print("test scene: {}".format(len(train_scenes)))
else:
print(
"train scene: {}, val scene: {}".format(
len(train_scenes), len(val_scenes)
)
)
train_nusc_infos, val_nusc_infos = _fill_trainval_infos(
nusc, train_scenes, val_scenes, test, max_sweeps=max_sweeps
)
metadata = dict(version=version)
if test:
print("test sample: {}".format(len(train_nusc_infos)))
data = dict(infos=train_nusc_infos, metadata=metadata)
info_path = "{}_infos_test.pkl".format(info_prefix)
mmcv.dump(data, info_path)
else:
print(
"train sample: {}, val sample: {}".format(
len(train_nusc_infos), len(val_nusc_infos)
)
)
data = dict(infos=train_nusc_infos, metadata=metadata)
info_path = "{}_infos_train.pkl".format(info_prefix)
mmcv.dump(data, info_path)
data["infos"] = val_nusc_infos
info_val_path = "{}_infos_val.pkl".format(info_prefix)
mmcv.dump(data, info_val_path)
def get_available_scenes(nusc):
"""Get available scenes from the input nuscenes class.
Given the raw data, get the information of available scenes for
further info generation.
Args:
nusc (class): Dataset class in the nuScenes dataset.
Returns:
available_scenes (list[dict]): List of basic information for the
available scenes.
"""
available_scenes = []
print("total scene num: {}".format(len(nusc.scene)))
for scene in nusc.scene:
scene_token = scene["token"]
scene_rec = nusc.get("scene", scene_token)
sample_rec = nusc.get("sample", scene_rec["first_sample_token"])
sd_rec = nusc.get("sample_data", sample_rec["data"]["LIDAR_TOP"])
has_more_frames = True
scene_not_exist = False
while has_more_frames:
lidar_path, boxes, _ = nusc.get_sample_data(sd_rec["token"])
lidar_path = str(lidar_path)
if os.getcwd() in lidar_path:
# path from lyftdataset is absolute path
lidar_path = lidar_path.split(f"{os.getcwd()}/")[-1]
# relative path
if not mmcv.is_filepath(lidar_path):
scene_not_exist = True
break
else:
break
if scene_not_exist:
continue
available_scenes.append(scene)
print("exist scene num: {}".format(len(available_scenes)))
return available_scenes
def _fill_trainval_infos(
nusc, train_scenes, val_scenes, test=False, max_sweeps=10
):
"""Generate the train/val infos from the raw data.
Args:
nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset.
train_scenes (list[str]): Basic information of training scenes.
val_scenes (list[str]): Basic information of validation scenes.
test (bool, optional): Whether use the test mode. In test mode, no
annotations can be accessed. Default: False.
max_sweeps (int, optional): Max number of sweeps. Default: 10.
Returns:
tuple[list[dict]]: Information of training set and validation set
that will be saved to the info file.
"""
train_nusc_infos = []
val_nusc_infos = []
for sample in mmcv.track_iter_progress(nusc.sample):
lidar_token = sample["data"]["LIDAR_TOP"]
sd_rec = nusc.get("sample_data", sample["data"]["LIDAR_TOP"])
cs_record = nusc.get(
"calibrated_sensor", sd_rec["calibrated_sensor_token"]
)
pose_record = nusc.get("ego_pose", sd_rec["ego_pose_token"])
lidar_path, boxes, _ = nusc.get_sample_data(lidar_token)
mmcv.check_file_exist(lidar_path)
info = {
"lidar_path": lidar_path,
"token": sample["token"],
"sweeps": [],
"cams": dict(),
"lidar2ego_translation": cs_record["translation"],
"lidar2ego_rotation": cs_record["rotation"],
"ego2global_translation": pose_record["translation"],
"ego2global_rotation": pose_record["rotation"],
"timestamp": sample["timestamp"],
}
l2e_r = info["lidar2ego_rotation"]
l2e_t = info["lidar2ego_translation"]
e2g_r = info["ego2global_rotation"]
e2g_t = info["ego2global_translation"]
l2e_r_mat = Quaternion(l2e_r).rotation_matrix
e2g_r_mat = Quaternion(e2g_r).rotation_matrix
# obtain information of the 6 camera images for each frame
camera_types = [
"CAM_FRONT",
"CAM_FRONT_RIGHT",
"CAM_FRONT_LEFT",
"CAM_BACK",
"CAM_BACK_LEFT",
"CAM_BACK_RIGHT",
]
for cam in camera_types:
cam_token = sample["data"][cam]
cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token)
cam_info = obtain_sensor2top(
nusc, cam_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, cam
)
cam_info.update(cam_intrinsic=cam_intrinsic)
info["cams"].update({cam: cam_info})
# obtain sweeps for a single key-frame
sd_rec = nusc.get("sample_data", sample["data"]["LIDAR_TOP"])
sweeps = []
while len(sweeps) < max_sweeps:
if not sd_rec["prev"] == "":
sweep = obtain_sensor2top(
nusc,
sd_rec["prev"],
l2e_t,
l2e_r_mat,
e2g_t,
e2g_r_mat,
"lidar",
)
sweeps.append(sweep)
sd_rec = nusc.get("sample_data", sd_rec["prev"])
else:
break
info["sweeps"] = sweeps
# obtain annotation
if not test:
annotations = [
nusc.get("sample_annotation", token)
for token in sample["anns"]
]
locs = np.array([b.center for b in boxes]).reshape(-1, 3)
dims = np.array([b.wlh for b in boxes]).reshape(-1, 3)
rots = np.array(
[b.orientation.yaw_pitch_roll[0] for b in boxes]
).reshape(-1, 1)
velocity = np.array(
[nusc.box_velocity(token)[:2] for token in sample["anns"]]
)
valid_flag = np.array(
[
(anno["num_lidar_pts"] + anno["num_radar_pts"]) > 0
for anno in annotations
],
dtype=bool,
).reshape(-1)
# convert velo from global to lidar
for i in range(len(boxes)):
velo = np.array([*velocity[i], 0.0])
velo = (
velo
@ np.linalg.inv(e2g_r_mat).T
@ np.linalg.inv(l2e_r_mat).T
)
velocity[i] = velo[:2]
names = [b.name for b in boxes]
for i in range(len(names)):
if names[i] in NameMapping:
names[i] = NameMapping[names[i]]
names = np.array(names)
# we need to convert box size to
# the format of our lidar coordinate system
# which is x_size, y_size, z_size (corresponding to l, w, h)
gt_boxes = np.concatenate([locs, dims[:, [1, 0, 2]], rots], axis=1)
assert len(gt_boxes) == len(
annotations
), f"{len(gt_boxes)}, {len(annotations)}"
info["instance_inds"] = np.array(
[
nusc.getind("instance", x["instance_token"])
for x in annotations
]
)
info["gt_boxes"] = gt_boxes
info["gt_names"] = names
info["gt_velocity"] = velocity.reshape(-1, 2)
info["num_lidar_pts"] = np.array(
[a["num_lidar_pts"] for a in annotations]
)
info["num_radar_pts"] = np.array(
[a["num_radar_pts"] for a in annotations]
)
info["valid_flag"] = valid_flag
if sample["scene_token"] in train_scenes:
train_nusc_infos.append(info)
else:
val_nusc_infos.append(info)
return train_nusc_infos, val_nusc_infos
def obtain_sensor2top(
nusc, sensor_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, sensor_type="lidar"
):
"""Obtain the info with RT matric from general sensor to Top LiDAR.
Args:
nusc (class): Dataset class in the nuScenes dataset.
sensor_token (str): Sample data token corresponding to the
specific sensor type.
l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
in shape (3, 3).
e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
e2g_r_mat (np.ndarray): Rotation matrix from ego to global
in shape (3, 3).
sensor_type (str, optional): Sensor to calibrate. Default: 'lidar'.
Returns:
sweep (dict): Sweep information after transformation.
"""
sd_rec = nusc.get("sample_data", sensor_token)
cs_record = nusc.get(
"calibrated_sensor", sd_rec["calibrated_sensor_token"]
)
pose_record = nusc.get("ego_pose", sd_rec["ego_pose_token"])
data_path = str(nusc.get_sample_data_path(sd_rec["token"]))
if os.getcwd() in data_path: # path from lyftdataset is absolute path
data_path = data_path.split(f"{os.getcwd()}/")[-1] # relative path
sweep = {
"data_path": data_path,
"type": sensor_type,
"sample_data_token": sd_rec["token"],
"sensor2ego_translation": cs_record["translation"],
"sensor2ego_rotation": cs_record["rotation"],
"ego2global_translation": pose_record["translation"],
"ego2global_rotation": pose_record["rotation"],
"timestamp": sd_rec["timestamp"],
}
l2e_r_s = sweep["sensor2ego_rotation"]
l2e_t_s = sweep["sensor2ego_translation"]
e2g_r_s = sweep["ego2global_rotation"]
e2g_t_s = sweep["ego2global_translation"]
# obtain the RT from sensor to Top LiDAR
# sweep->ego->global->ego'->lidar
l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix
e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix
R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ (
np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T
)
T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ (
np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T
)
T -= (
e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+ l2e_t @ np.linalg.inv(l2e_r_mat).T
)
sweep["sensor2lidar_rotation"] = R.T # points @ R.T + T
sweep["sensor2lidar_translation"] = T
return sweep
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="nuscenes converter")
parser.add_argument("--root_path", type=str, default="./data/nuscenes")
parser.add_argument("--info_prefix", type=str, default="nuscenes")
parser.add_argument("--version", type=str, default="v1.0-trainval,v1.0-test")
parser.add_argument("--max_sweeps", type=int, default=10)
args = parser.parse_args()
versions = args.version.split(",")
for version in versions:
create_nuscenes_infos(
args.root_path,
args.info_prefix,
version=version,
max_sweeps=args.max_sweeps,
)
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import mmcv
import os
import torch
import warnings
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (
get_dist_info,
init_dist,
load_checkpoint,
wrap_fp16_model,
)
from mmdet.apis import single_gpu_test, multi_gpu_test, set_random_seed
from mmdet.datasets import replace_ImageToTensor, build_dataset
from mmdet.datasets import build_dataloader as build_dataloader_origin
from mmdet.models import build_detector
from projects.mmdet3d_plugin.datasets.builder import build_dataloader
from projects.mmdet3d_plugin.apis.test import custom_multi_gpu_test
def parse_args():
parser = argparse.ArgumentParser(
description="MMDet test (and eval) a model"
)
parser.add_argument("config", help="test config file path")
parser.add_argument("checkpoint", help="checkpoint file")
parser.add_argument("--out", help="output result file in pickle format")
parser.add_argument(
"--fuse-conv-bn",
action="store_true",
help="Whether to fuse conv and bn, this will slightly increase"
"the inference speed",
)
parser.add_argument(
"--format-only",
action="store_true",
help="Format the output results without perform evaluation. It is"
"useful when you want to format the result to a specific format and "
"submit it to the test server",
)
parser.add_argument(
"--eval",
type=str,
nargs="+",
help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC',
)
parser.add_argument("--show", action="store_true", help="show results")
parser.add_argument(
"--show-dir", help="directory where results will be saved"
)
parser.add_argument(
"--gpu-collect",
action="store_true",
help="whether to use gpu to collect results.",
)
parser.add_argument(
"--tmpdir",
help="tmp directory used for collecting results from multiple "
"workers, available when gpu-collect is not specified",
)
parser.add_argument("--seed", type=int, default=0, help="random seed")
parser.add_argument(
"--deterministic",
action="store_true",
help="whether to set deterministic options for CUDNN backend.",
)
parser.add_argument(
"--cfg-options",
nargs="+",
action=DictAction,
help="override some settings in the used config, the key-value pair "
"in xxx=yyy format will be merged into config file. If the value to "
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
"Note that the quotation marks are necessary and that no white space "
"is allowed.",
)
parser.add_argument(
"--options",
nargs="+",
action=DictAction,
help="custom options for evaluation, the key-value pair in xxx=yyy "
"format will be kwargs for dataset.evaluate() function (deprecate), "
"change to --eval-options instead.",
)
parser.add_argument(
"--eval-options",
nargs="+",
action=DictAction,
help="custom options for evaluation, the key-value pair in xxx=yyy "
"format will be kwargs for dataset.evaluate() function",
)
parser.add_argument(
"--launcher",
choices=["none", "pytorch", "slurm", "mpi"],
default="none",
help="job launcher",
)
parser.add_argument("--local_rank", type=int, default=0)
parser.add_argument("--result_file", type=str, default=None)
parser.add_argument("--show_only", action="store_true")
args = parser.parse_args()
if "LOCAL_RANK" not in os.environ:
os.environ["LOCAL_RANK"] = str(args.local_rank)
if args.options and args.eval_options:
raise ValueError(
"--options and --eval-options cannot be both specified, "
"--options is deprecated in favor of --eval-options"
)
if args.options:
warnings.warn("--options is deprecated in favor of --eval-options")
args.eval_options = args.options
return args
def main():
args = parse_args()
assert (
args.out or args.eval or args.format_only or args.show or args.show_dir
), (
"Please specify at least one operation (save/eval/format/show the "
'results / save the results) with the argument "--out", "--eval"'
', "--format-only", "--show" or "--show-dir"'
)
if args.eval and args.format_only:
raise ValueError("--eval and --format_only cannot be both specified")
if args.out is not None and not args.out.endswith((".pkl", ".pickle")):
raise ValueError("The output file must be a pkl file.")
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# import modules from string list.
if cfg.get("custom_imports", None):
from mmcv.utils import import_modules_from_strings
import_modules_from_strings(**cfg["custom_imports"])
# import modules from plugin/xx, registry will be updated
if hasattr(cfg, "plugin"):
if cfg.plugin:
import importlib
if hasattr(cfg, "plugin_dir"):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split("/")
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + "." + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split("/")
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + "." + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
# set cudnn_benchmark
if cfg.get("cudnn_benchmark", False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
# in case the test dataset is concatenated
samples_per_gpu = 1
if isinstance(cfg.data.test, dict):
cfg.data.test.test_mode = True
samples_per_gpu = cfg.data.test.pop("samples_per_gpu", 1)
if samples_per_gpu > 1:
# Replace 'ImageToTensor' to 'DefaultFormatBundle'
cfg.data.test.pipeline = replace_ImageToTensor(
cfg.data.test.pipeline
)
elif isinstance(cfg.data.test, list):
for ds_cfg in cfg.data.test:
ds_cfg.test_mode = True
samples_per_gpu = max(
[ds_cfg.pop("samples_per_gpu", 1) for ds_cfg in cfg.data.test]
)
if samples_per_gpu > 1:
for ds_cfg in cfg.data.test:
ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
# init distributed env first, since logger depends on the dist info.
if args.launcher == "none":
distributed = False
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
# set random seeds
if args.seed is not None:
set_random_seed(args.seed, deterministic=args.deterministic)
# build the dataloader
dataset = build_dataset(cfg.data.test)
print("distributed:", distributed)
if distributed:
data_loader = build_dataloader(
dataset,
samples_per_gpu=samples_per_gpu,
workers_per_gpu=cfg.data.workers_per_gpu,
dist=distributed,
shuffle=False,
nonshuffler_sampler=dict(type="DistributedSampler"),
)
else:
data_loader = build_dataloader_origin(
dataset,
samples_per_gpu=samples_per_gpu,
workers_per_gpu=cfg.data.workers_per_gpu,
dist=distributed,
shuffle=False,
)
# build the model and load checkpoint
cfg.model.train_cfg = None
model = build_detector(cfg.model, test_cfg=cfg.get("test_cfg"))
# model = build_model(cfg.model, test_cfg=cfg.get("test_cfg"))
fp16_cfg = cfg.get("fp16", None)
if fp16_cfg is not None:
wrap_fp16_model(model)
checkpoint = load_checkpoint(model, args.checkpoint, map_location="cpu")
if args.fuse_conv_bn:
model = fuse_conv_bn(model)
# old versions did not save class info in checkpoints, this workaround is
# for backward compatibility
if "CLASSES" in checkpoint.get("meta", {}):
model.CLASSES = checkpoint["meta"]["CLASSES"]
else:
model.CLASSES = dataset.CLASSES
# palette for visualization in segmentation tasks
if "PALETTE" in checkpoint.get("meta", {}):
model.PALETTE = checkpoint["meta"]["PALETTE"]
elif hasattr(dataset, "PALETTE"):
# segmentation dataset has `PALETTE` attribute
model.PALETTE = dataset.PALETTE
if args.result_file is not None:
outputs = torch.load(args.result_file)
elif not distributed:
model = MMDataParallel(model, device_ids=[0])
outputs = single_gpu_test(model, data_loader, args.show, args.show_dir)
else:
model = MMDistributedDataParallel(
model.cuda(),
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False,
)
outputs = custom_multi_gpu_test(
model, data_loader, args.tmpdir, args.gpu_collect
)
rank, _ = get_dist_info()
if rank == 0:
if args.out:
print(f"\nwriting results to {args.out}")
mmcv.dump(outputs, args.out)
kwargs = {} if args.eval_options is None else args.eval_options
if args.show_only:
eval_kwargs = cfg.get("evaluation", {}).copy()
# hard-code way to remove EvalHook args
for key in [
"interval",
"tmpdir",
"start",
"gpu_collect",
"save_best",
"rule",
]:
eval_kwargs.pop(key, None)
eval_kwargs.update(kwargs)
dataset.show(outputs, show=True, **eval_kwargs)
elif args.format_only:
dataset.format_results(outputs, **kwargs)
elif args.eval:
eval_kwargs = cfg.get("evaluation", {}).copy()
# hard-code way to remove EvalHook args
for key in [
"interval",
"tmpdir",
"start",
"gpu_collect",
"save_best",
"rule",
]:
eval_kwargs.pop(key, None)
eval_kwargs.update(dict(metric=args.eval, **kwargs))
print(eval_kwargs)
results_dict = dataset.evaluate(outputs, **eval_kwargs)
print(results_dict)
if __name__ == "__main__":
torch.multiprocessing.set_start_method(
"fork"
) # use fork so that workers_per_gpu can be > 1
main()
# Copyright (c) OpenMMLab. All rights reserved.
from __future__ import division
import sys
import os
print(sys.executable, os.path.abspath(__file__))
# import init_paths # for conda pkgs submitting method
import argparse
import copy
import mmcv
import time
import torch
import warnings
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist
from os import path as osp
from mmdet import __version__ as mmdet_version
from mmdet.apis import train_detector
from mmdet.datasets import build_dataset
from mmdet.models import build_detector
from mmdet.utils import collect_env, get_root_logger
from mmdet.apis import set_random_seed
from torch import distributed as dist
from datetime import timedelta
import cv2
cv2.setNumThreads(8)
def parse_args():
parser = argparse.ArgumentParser(description="Train a detector")
parser.add_argument("config", help="train config file path")
parser.add_argument("--work-dir", help="the dir to save logs and models")
parser.add_argument(
"--resume-from", help="the checkpoint file to resume from"
)
parser.add_argument(
"--no-validate",
action="store_true",
help="whether not to evaluate the checkpoint during training",
)
group_gpus = parser.add_mutually_exclusive_group()
group_gpus.add_argument(
"--gpus",
type=int,
help="number of gpus to use "
"(only applicable to non-distributed training)",
)
group_gpus.add_argument(
"--gpu-ids",
type=int,
nargs="+",
help="ids of gpus to use "
"(only applicable to non-distributed training)",
)
parser.add_argument("--seed", type=int, default=0, help="random seed")
parser.add_argument(
"--deterministic",
action="store_true",
help="whether to set deterministic options for CUDNN backend.",
)
parser.add_argument(
"--options",
nargs="+",
action=DictAction,
help="override some settings in the used config, the key-value pair "
"in xxx=yyy format will be merged into config file (deprecate), "
"change to --cfg-options instead.",
)
parser.add_argument(
"--cfg-options",
nargs="+",
action=DictAction,
help="override some settings in the used config, the key-value pair "
"in xxx=yyy format will be merged into config file. If the value to "
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
"Note that the quotation marks are necessary and that no white space "
"is allowed.",
)
parser.add_argument(
"--dist-url",
type=str,
default="auto",
help="dist url for init process, such as tcp://localhost:8000",
)
parser.add_argument("--gpus-per-machine", type=int, default=8)
parser.add_argument(
"--launcher",
choices=["none", "pytorch", "slurm", "mpi", "mpi_nccl"],
default="none",
help="job launcher",
)
parser.add_argument("--local_rank", type=int, default=0)
parser.add_argument(
"--autoscale-lr",
action="store_true",
help="automatically scale lr with the number of gpus",
)
args = parser.parse_args()
if "LOCAL_RANK" not in os.environ:
os.environ["LOCAL_RANK"] = str(args.local_rank)
if args.options and args.cfg_options:
raise ValueError(
"--options and --cfg-options cannot be both specified, "
"--options is deprecated in favor of --cfg-options"
)
if args.options:
warnings.warn("--options is deprecated in favor of --cfg-options")
args.cfg_options = args.options
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# import modules from string list.
if cfg.get("custom_imports", None):
from mmcv.utils import import_modules_from_strings
import_modules_from_strings(**cfg["custom_imports"])
# import modules from plugin/xx, registry will be updated
if hasattr(cfg, "plugin"):
if cfg.plugin:
import importlib
if hasattr(cfg, "plugin_dir"):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split("/")
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + "." + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split("/")
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + "." + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
from projects.mmdet3d_plugin.apis.train import custom_train_model
# set cudnn_benchmark
if cfg.get("cudnn_benchmark", False):
torch.backends.cudnn.benchmark = True
# work_dir is determined in this priority: CLI > segment in file > filename
if args.work_dir is not None:
# update configs according to CLI args if args.work_dir is not None
cfg.work_dir = args.work_dir
elif cfg.get("work_dir", None) is None:
# use config filename as default work_dir if cfg.work_dir is None
cfg.work_dir = osp.join(
"./work_dirs", osp.splitext(osp.basename(args.config))[0]
)
if args.resume_from is not None:
cfg.resume_from = args.resume_from
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
if args.autoscale_lr:
# apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
cfg.optimizer["lr"] = cfg.optimizer["lr"] * len(cfg.gpu_ids) / 8
# init distributed env first, since logger depends on the dist info.
print("args.launcher:",args.launcher)
if args.launcher == "none":
distributed = False
print("cfg.gpu_ids_none:", cfg.gpu_ids)
elif args.launcher == "mpi_nccl":
distributed = True
import mpi4py.MPI as MPI
comm = MPI.COMM_WORLD
mpi_local_rank = comm.Get_rank()
mpi_world_size = comm.Get_size()
print(
"MPI local_rank=%d, world_size=%d"
% (mpi_local_rank, mpi_world_size)
)
# num_gpus = torch.cuda.device_count()
device_ids_on_machines = list(range(args.gpus_per_machine))
str_ids = list(map(str, device_ids_on_machines))
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str_ids)
torch.cuda.set_device(mpi_local_rank % args.gpus_per_machine)
dist.init_process_group(
backend="nccl",
init_method=args.dist_url,
world_size=mpi_world_size,
rank=mpi_local_rank,
timeout=timedelta(seconds=3600),
)
cfg.gpu_ids = range(mpi_world_size)
print("cfg.gpu_ids_mpi_nccl:", cfg.gpu_ids)
else:
distributed = True
init_dist(
args.launcher, timeout=timedelta(seconds=3600), **cfg.dist_params
)
# re-set gpu_ids with distributed training mode
_, world_size = get_dist_info()
cfg.gpu_ids = range(world_size)
print("cfg.gpu_ids_others:", cfg.gpu_ids)
# create work_dir
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
# dump config
cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
# init the logger before other steps
timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
log_file = osp.join(cfg.work_dir, f"{timestamp}.log")
# specify logger name, if we still use 'mmdet', the output info will be
# filtered and won't be saved in the log_file
# TODO: ugly workaround to judge whether we are training det or seg model
logger = get_root_logger(
log_file=log_file, log_level=cfg.log_level
)
# init the meta dict to record some important information such as
# environment info and seed, which will be logged
meta = dict()
# log env info
env_info_dict = collect_env()
env_info = "\n".join([(f"{k}: {v}") for k, v in env_info_dict.items()])
dash_line = "-" * 60 + "\n"
logger.info(
"Environment info:\n" + dash_line + env_info + "\n" + dash_line
)
meta["env_info"] = env_info
meta["config"] = cfg.pretty_text
# log some basic info
logger.info(f"Distributed training: {distributed}")
logger.info(f"Config:\n{cfg.pretty_text}")
# set random seeds
if args.seed is not None:
logger.info(
f"Set random seed to {args.seed}, "
f"deterministic: {args.deterministic}"
)
set_random_seed(args.seed, deterministic=args.deterministic)
cfg.seed = args.seed
meta["seed"] = args.seed
meta["exp_name"] = osp.basename(args.config)
model = build_detector(
cfg.model, train_cfg=cfg.get("train_cfg"), test_cfg=cfg.get("test_cfg")
)
model.init_weights()
logger.info(f"Model:\n{model}")
datasets = [build_dataset(cfg.data.train)]
if len(cfg.workflow) == 2:
val_dataset = copy.deepcopy(cfg.data.val)
# in case we use a dataset wrapper
if "dataset" in cfg.data.train:
val_dataset.pipeline = cfg.data.train.dataset.pipeline
else:
val_dataset.pipeline = cfg.data.train.pipeline
# set test_mode=False here in deep copied config
# which do not affect AP/AR calculation later
# refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa
val_dataset.test_mode = False
datasets.append(build_dataset(val_dataset))
if cfg.checkpoint_config is not None:
# save mmdet version, config file content and class names in
# checkpoints as meta data
cfg.checkpoint_config.meta = dict(
mmdet_version=mmdet_version,
config=cfg.pretty_text,
CLASSES=datasets[0].CLASSES,
)
# add an attribute for visualization convenience
model.CLASSES = datasets[0].CLASSES
torch.backends.cudnn.benchmark = True # enable cuDNN auto-tuning to find the fastest convolution algorithms
if hasattr(cfg, "plugin"):
custom_train_model(
model,
datasets,
cfg,
distributed=distributed,
validate=(not args.no_validate),
timestamp=timestamp,
meta=meta,
)
else:
train_detector(
model,
datasets,
cfg,
distributed=distributed,
validate=(not args.no_validate),
timestamp=timestamp,
meta=meta,
)
if __name__ == "__main__":
torch.multiprocessing.set_start_method(
"fork"
) # use fork so that workers_per_gpu can be > 1
main()