Commit a9dc86e9 authored by lishj6

init_0905

parent 18eda5c1
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <THC/THCAtomics.cuh>
#include <iostream>
#include <stdlib.h>
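// Bilinear interpolation of one feature channel at fractional pixel
// coordinates (h_im, w_im): the four neighbouring samples are gathered
// (out-of-range corners contribute zero) and blended with bilinear weights.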
__device__ float bilinear_sampling(
const float *&bottom_data, const int &height, const int &width,
const int &num_embeds, const float &h_im, const float &w_im,
const int &base_ptr
) {
const int h_low = floorf(h_im);
const int w_low = floorf(w_im);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const float lh = h_im - h_low;
const float lw = w_im - w_low;
const float hh = 1 - lh, hw = 1 - lw;
const int w_stride = num_embeds;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) {
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
}
float v2 = 0;
if (h_low >= 0 && w_high <= width - 1) {
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
}
float v3 = 0;
if (h_high <= height - 1 && w_low >= 0) {
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
}
float v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) {
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
}
const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
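// Backward of the bilinear sampling above: for each in-range corner the kernel
// scatters w_i * grad_output * weight into grad_mc_ms_feat with atomic adds,
// accumulates d(val)/d(h_im) and d(val)/d(w_im) in grad_h_weight / grad_w_weight,
// and finally writes the gradients w.r.t. the aggregation weight and the
// normalized sampling location (scaled by height / width, since
// h_im = loc_h * height - 0.5 and w_im = loc_w * width - 0.5).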
__device__ void bilinear_sampling_grad(
const float *&bottom_data, const float &weight,
const int &height, const int &width,
const int &num_embeds, const float &h_im, const float &w_im,
const int &base_ptr,
const float &grad_output,
float *&grad_mc_ms_feat, float *grad_sampling_location, float *grad_weights) {
const int h_low = floorf(h_im);
const int w_low = floorf(w_im);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const float lh = h_im - h_low;
const float lw = w_im - w_low;
const float hh = 1 - lh, hw = 1 - lw;
const int w_stride = num_embeds;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const float top_grad_mc_ms_feat = grad_output * weight;
float grad_h_weight = 0, grad_w_weight = 0;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) {
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
// atomicAdd(grad_mc_ms_feat + ptr1, w1 * top_grad_mc_ms_feat);
#ifdef __gfx936__
__builtin_amdgcn_global_atomic_fadd_f32(grad_mc_ms_feat + ptr1, w1 * top_grad_mc_ms_feat);
#else
atomicAdd(grad_mc_ms_feat + ptr1, w1 * top_grad_mc_ms_feat);
#endif
}
float v2 = 0;
if (h_low >= 0 && w_high <= width - 1) {
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
// atomicAdd(grad_mc_ms_feat + ptr2, w2 * top_grad_mc_ms_feat);
#ifdef __gfx936__
__builtin_amdgcn_global_atomic_fadd_f32(grad_mc_ms_feat + ptr2, w2 * top_grad_mc_ms_feat);
#else
atomicAdd(grad_mc_ms_feat + ptr2, w2 * top_grad_mc_ms_feat);
#endif
}
float v3 = 0;
if (h_high <= height - 1 && w_low >= 0) {
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
// atomicAdd(grad_mc_ms_feat + ptr3, w3 * top_grad_mc_ms_feat);
#ifdef __gfx936__
__builtin_amdgcn_global_atomic_fadd_f32(grad_mc_ms_feat + ptr3, w3 * top_grad_mc_ms_feat);
#else
atomicAdd(grad_mc_ms_feat + ptr3, w3 * top_grad_mc_ms_feat);
#endif
}
float v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) {
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
// atomicAdd(grad_mc_ms_feat + ptr4, w4 * top_grad_mc_ms_feat);
#ifdef __gfx936__
__builtin_amdgcn_global_atomic_fadd_f32(grad_mc_ms_feat + ptr4, w4 * top_grad_mc_ms_feat);
#else
atomicAdd(grad_mc_ms_feat + ptr4, w4 * top_grad_mc_ms_feat);
#endif
}
const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
// atomicAdd(grad_weights, grad_output * val);
// atomicAdd(grad_sampling_location, width * grad_w_weight * top_grad_mc_ms_feat);
// atomicAdd(grad_sampling_location + 1, height * grad_h_weight * top_grad_mc_ms_feat);
#ifdef __gfx936__
__builtin_amdgcn_global_atomic_fadd_f32(grad_weights, grad_output * val);
__syncthreads();
__builtin_amdgcn_global_atomic_fadd_f32(grad_sampling_location, width * grad_w_weight * top_grad_mc_ms_feat);
__builtin_amdgcn_global_atomic_fadd_f32(grad_sampling_location + 1, height * grad_h_weight * top_grad_mc_ms_feat);
#else
atomicAdd(grad_weights, grad_output * val);
atomicAdd(grad_sampling_location, width * grad_w_weight * top_grad_mc_ms_feat);
atomicAdd(grad_sampling_location + 1, height * grad_h_weight * top_grad_mc_ms_feat);
#endif
}
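// One thread per output element: the flat index decomposes (fastest to slowest)
// into channel, scale, camera, point, anchor and batch. Each thread bilinearly
// samples one channel of one (camera, scale) feature map at one key point and
// atomically accumulates the weighted value into output[batch, anchor, channel].
// A single weight is shared by num_embeds / num_groups consecutive channels.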
__global__ void deformable_aggregation_kernel(
const int num_kernels,
float* output,
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_anchors,
int num_pts,
int num_groups
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= num_kernels) return;
const float weight = *(weights + idx / (num_embeds / num_groups));
const int channel_index = idx % num_embeds;
idx /= num_embeds;
const int scale_index = idx % num_scale;
idx /= num_scale;
const int cam_index = idx % num_cams;
idx /= num_cams;
const int pts_index = idx % num_pts;
idx /= num_pts;
int anchor_index = idx % num_anchors;
idx /= num_anchors;
const int batch_index = idx % batch_size;
idx /= batch_size;
anchor_index = batch_index * num_anchors + anchor_index;
const int loc_offset = ((anchor_index * num_pts + pts_index) * num_cams + cam_index) << 1;
const float loc_w = sample_location[loc_offset];
if (loc_w <= 0 || loc_w >= 1) return;
const float loc_h = sample_location[loc_offset + 1];
if (loc_h <= 0 || loc_h >= 1) return;
int cam_scale_index = cam_index * num_scale + scale_index;
const int value_offset = (batch_index * num_feat + scale_start_index[cam_scale_index]) * num_embeds + channel_index;
cam_scale_index = cam_scale_index << 1;
const int h = spatial_shape[cam_scale_index];
const int w = spatial_shape[cam_scale_index + 1];
const float h_im = loc_h * h - 0.5;
const float w_im = loc_w * w - 0.5;
// atomicAdd(
// output + anchor_index * num_embeds + channel_index,
// bilinear_sampling(mc_ms_feat, h, w, num_embeds, h_im, w_im, value_offset) * weight
// );
#ifdef __gfx936__
__builtin_amdgcn_global_atomic_fadd_f32(output + anchor_index * num_embeds + channel_index, bilinear_sampling(mc_ms_feat, h, w, num_embeds, h_im, w_im, value_offset) * weight);
#else
atomicAdd(output + anchor_index * num_embeds + channel_index, bilinear_sampling(mc_ms_feat, h, w, num_embeds, h_im, w_im, value_offset) * weight);
#endif
}
__global__ void deformable_aggregation_grad_kernel(
const int num_kernels,
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
const float* grad_output,
float* grad_mc_ms_feat,
float* grad_sampling_location,
float* grad_weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_anchors,
int num_pts,
int num_groups
) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= num_kernels) return;
const int weights_ptr = idx / (num_embeds / num_groups);
const int channel_index = idx % num_embeds;
idx /= num_embeds;
const int scale_index = idx % num_scale;
idx /= num_scale;
const int cam_index = idx % num_cams;
idx /= num_cams;
const int pts_index = idx % num_pts;
idx /= num_pts;
int anchor_index = idx % num_anchors;
idx /= num_anchors;
const int batch_index = idx % batch_size;
idx /= batch_size;
anchor_index = batch_index * num_anchors + anchor_index;
const int loc_offset = ((anchor_index * num_pts + pts_index) * num_cams + cam_index) << 1;
const float loc_w = sample_location[loc_offset];
if (loc_w <= 0 || loc_w >= 1) return;
const float loc_h = sample_location[loc_offset + 1];
if (loc_h <= 0 || loc_h >= 1) return;
const float grad = grad_output[anchor_index*num_embeds + channel_index];
int cam_scale_index = cam_index * num_scale + scale_index;
const int value_offset = (batch_index * num_feat + scale_start_index[cam_scale_index]) * num_embeds + channel_index;
cam_scale_index = cam_scale_index << 1;
const int h = spatial_shape[cam_scale_index];
const int w = spatial_shape[cam_scale_index + 1];
const float h_im = loc_h * h - 0.5;
const float w_im = loc_w * w - 0.5;
/* atomicAdd( */
/* output + anchor_index * num_embeds + channel_index, */
/* bilinear_sampling(mc_ms_feat, h, w, num_embeds, h_im, w_im, value_offset) * weight */
/* ); */
const float weight = weights[weights_ptr];
float *grad_weights_ptr = grad_weights + weights_ptr;
float *grad_location_ptr = grad_sampling_location + loc_offset;
bilinear_sampling_grad(
mc_ms_feat, weight, h, w, num_embeds, h_im, w_im,
value_offset,
grad,
grad_mc_ms_feat, grad_location_ptr, grad_weights_ptr
);
}
void deformable_aggregation(
float* output,
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_anchors,
int num_pts,
int num_groups
) {
const int num_kernels = batch_size * num_pts * num_embeds * num_anchors * num_cams * num_scale;
deformable_aggregation_kernel
<<<(int)ceil(((double)num_kernels/128)), 128>>>(
num_kernels, output,
mc_ms_feat, spatial_shape, scale_start_index, sample_location, weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
);
}
void deformable_aggregation_grad(
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
const float* grad_output,
float* grad_mc_ms_feat,
float* grad_sampling_location,
float* grad_weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_anchors,
int num_pts,
int num_groups
) {
const int num_kernels = batch_size * num_pts * num_embeds * num_anchors * num_cams * num_scale;
deformable_aggregation_grad_kernel
<<<(int)ceil(((double)num_kernels/128)), 128>>>(
num_kernels,
mc_ms_feat, spatial_shape, scale_start_index, sample_location, weights,
grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
);
}
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include <torch/extension.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
void deformable_aggregation(
float* output,
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_anchors,
int num_pts,
int num_groups
);
/* feat: bs, num_feat, c */
/* _spatial_shape: cam, scale, 2 */
/* _scale_start_index: cam, scale */
/* _sampling_location: bs, anchor, pts, cam, 2 */
/* _weights: bs, anchor, pts, cam, scale, group */
/* output: bs, anchor, c */
/* kernel: bs, anchor, pts, c */
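/* Note (inferred from the kernels above): feat stores, for each batch element,
   all camera/scale feature maps flattened over their spatial dims and
   concatenated along the num_feat axis, so num_feat = sum over (cam, scale)
   of h * w, and _scale_start_index[cam, scale] is the row offset of each map. */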
at::Tensor deformable_aggregation_forward(
const at::Tensor &_mc_ms_feat,
const at::Tensor &_spatial_shape,
const at::Tensor &_scale_start_index,
const at::Tensor &_sampling_location,
const at::Tensor &_weights
) {
at::DeviceGuard guard(_mc_ms_feat.device());
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(_mc_ms_feat));
int batch_size = _mc_ms_feat.size(0);
int num_feat = _mc_ms_feat.size(1);
int num_embeds = _mc_ms_feat.size(2);
int num_cams = _spatial_shape.size(0);
int num_scale = _spatial_shape.size(1);
int num_anchors = _sampling_location.size(1);
int num_pts = _sampling_location.size(2);
int num_groups = _weights.size(5);
const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
const int* spatial_shape = _spatial_shape.data_ptr<int>();
const int* scale_start_index = _scale_start_index.data_ptr<int>();
const float* sampling_location = _sampling_location.data_ptr<float>();
const float* weights = _weights.data_ptr<float>();
auto output = at::zeros({batch_size, num_anchors, num_embeds}, _mc_ms_feat.options());
deformable_aggregation(
output.data_ptr<float>(),
mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
);
return output;
}
void deformable_aggregation_grad(
const float* mc_ms_feat,
const int* spatial_shape,
const int* scale_start_index,
const float* sample_location,
const float* weights,
const float* grad_output,
float* grad_mc_ms_feat,
float* grad_sampling_location,
float* grad_weights,
int batch_size,
int num_cams,
int num_feat,
int num_embeds,
int num_scale,
int num_anchors,
int num_pts,
int num_groups
);
void deformable_aggregation_backward(
const at::Tensor &_mc_ms_feat,
const at::Tensor &_spatial_shape,
const at::Tensor &_scale_start_index,
const at::Tensor &_sampling_location,
const at::Tensor &_weights,
const at::Tensor &_grad_output,
at::Tensor &_grad_mc_ms_feat,
at::Tensor &_grad_sampling_location,
at::Tensor &_grad_weights
) {
at::DeviceGuard guard(_mc_ms_feat.device());
const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(_mc_ms_feat));
int batch_size = _mc_ms_feat.size(0);
int num_feat = _mc_ms_feat.size(1);
int num_embeds = _mc_ms_feat.size(2);
int num_cams = _spatial_shape.size(0);
int num_scale = _spatial_shape.size(1);
int num_anchors = _sampling_location.size(1);
int num_pts = _sampling_location.size(2);
int num_groups = _weights.size(5);
const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
const int* spatial_shape = _spatial_shape.data_ptr<int>();
const int* scale_start_index = _scale_start_index.data_ptr<int>();
const float* sampling_location = _sampling_location.data_ptr<float>();
const float* weights = _weights.data_ptr<float>();
const float* grad_output = _grad_output.data_ptr<float>();
float* grad_mc_ms_feat = _grad_mc_ms_feat.data_ptr<float>();
float* grad_sampling_location = _grad_sampling_location.data_ptr<float>();
float* grad_weights = _grad_weights.data_ptr<float>();
deformable_aggregation_grad(
mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
grad_output, grad_mc_ms_feat, grad_sampling_location, grad_weights,
batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(
"deformable_aggregation_forward",
&deformable_aggregation_forward,
"deformable_aggregation_forward"
);
m.def(
"deformable_aggregation_backward",
&deformable_aggregation_backward,
"deformable_aggregation_backward"
);
}
<div align="center">
# Sparse4D: Sparse-based End-to-end Multi-view Temporal Perception
</div>
> [Github](https://github.com/HorizonRobotics/Sparse4D) \
> [Sparse4D v1: Multi-view 3D Object Detection with Sparse Spatial-Temporal Fusion](https://arxiv.org/abs/2211.10581) \
> [Sparse4D v2: Recurrent Temporal Fusion with Sparse Model](https://arxiv.org/abs/2305.14018) \
> [Sparse4D v3: Advancing End-to-End 3D Detection and Tracking](https://arxiv.org/abs/2311.11722) \
> [SparseDrive: End-to-End Autonomous Driving via Sparse Scene Representation](https://arxiv.org/abs/2405.19620) \
> [Chinese Interpretation of the Papers](https://zhuanlan.zhihu.com/p/637096473)
**【2024/06】Our follow-up project, SparseDrive (an end-to-end planning model based on the sparse framework), has been released!!! [arxiv](https://arxiv.org/abs/2405.19620) & [github](https://github.com/swc-17/SparseDrive).**
## Overall Architecture
<center>
<img style="border-radius: 0.3125em;
box-shadow: 0 2px 4px 0 rgba(34,36,38,.12),0 2px 10px 0 rgba(34,36,38,.08);"
src="resources/sparse4d_architecture.jpg" width="1000">
<br>
<div style="color:orange; border-bottom: 1px solid #d9d9d9;
display: inline-block;
color: #999;
padding: 2px;">Overall Framework of Sparse4D, which conforms to an encoder-decoder structure. The inputs mainly consists of three components: multi-view images, newly initialized instances, propagated instances from previous frame. The output is the refined instances (3D anchor boxes and corresponding features), serve as the perception results for the current frame. Additionally, a subset of these refined instances is selected and propagated to the next frame.</div>
</center>
<center>
<img style="border-radius: 0.3125em;
box-shadow: 0 2px 4px 0 rgba(34,36,38,.12),0 2px 10px 0 rgba(34,36,38,.08);"
src="resources/efficient_deformable_aggregation.jpg" width="1000">
<br>
<div style="color:orange; border-bottom: 1px solid #d9d9d9;
display: inline-block;
color: #999;
padding: 2px;"> Illustration of our Efficient Deformable Aggregation Module. (a) The basic pipeline: we first generate multiple 3D key points inside 3D anchor, then sampling multi-scale/view image feature for each keypoint, and fuse these feature with predicted weight. (b) The parallel implementation: to further improve speed and reduce memory cost, we achieve a parallel implementation, where feature sampling and multi-view/scale weighted sum are combined as a CUDA operation. Our CUDA implementation supports handling different feature resolutions from different views. </div>
</center>
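For readers who prefer code to diagrams, the sketch below is a minimal pure-PyTorch reference of the computation performed by the fused CUDA operator. It is an illustration only: it assumes the per-(camera, scale) feature maps are given as a camera-major list of `[bs, c, h, w]` tensors, that sampling locations are normalized `(w, h)` coordinates in `[0, 1]`, and the function name and argument layout are made up for this example. `align_corners=False` matches the kernel's `loc * size - 0.5` pixel-coordinate convention, while the zero-padding boundary handling of `grid_sample` differs slightly from the kernel's explicit out-of-range check.

```python
import torch
import torch.nn.functional as F

def deformable_aggregation_reference(feat_maps, sample_location, weights):
    # feat_maps:        camera-major list of [bs, c, h, w] tensors, one per (camera, scale)
    # sample_location:  [bs, anchor, pts, cam, 2], normalized (w, h) in [0, 1]
    # weights:          [bs, anchor, pts, cam, scale, group]
    # returns:          [bs, anchor, c]
    bs, num_anchor, num_pts, num_cams, _ = sample_location.shape
    num_scale, num_groups = weights.shape[4], weights.shape[5]
    c = feat_maps[0].shape[1]
    output = feat_maps[0].new_zeros(bs, num_anchor, c)
    for cam in range(num_cams):
        # grid_sample expects (x, y) coordinates in [-1, 1]
        grid = sample_location[:, :, :, cam] * 2 - 1                 # [bs, anchor, pts, 2]
        for scale in range(num_scale):
            fm = feat_maps[cam * num_scale + scale]                  # [bs, c, h, w]
            sampled = F.grid_sample(fm, grid, align_corners=False)   # [bs, c, anchor, pts]
            w = weights[:, :, :, cam, scale]                         # [bs, anchor, pts, group]
            w = w.repeat_interleave(c // num_groups, dim=-1)         # broadcast groups to channels
            output += (sampled.permute(0, 2, 3, 1) * w).sum(dim=2)   # sum over key points
    return output
```

The fused kernel carries out the same sampling and group-wise weighted sum in a single pass without materializing the per-point sampled features, which is where the speed and memory savings come from.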
## nuScenes Benchmark
### Results on Validation Split
These experiments were conducted using 8 RTX 3090 GPUs with 24 GB memory.
|model | backbone |pretrain| img size | Epoch | Training | FPS | NDS | mAP | AMOTA |AMOTP |IDS| config | ckpt | log |
| :----: | :---: | :---: | :---: | :---: | :---:| :---:|:---:|:---: | :---: | :----: | :----: | :---: | :----: | :----: |
|Sparse4D-T4 |Res101|[FCOS3D](https://github.com/linxuewu/Sparse4D/releases/download/v0.0/fcos3d.pth)|640x1600|24|2Day5H|2.9|0.5438|0.4409|-|-|-|[cfg](https://github.com/linxuewu/Sparse4D/blob/v2.0/projects/configs/sparse4d_r101_H4.py)|[ckpt](https://github.com/linxuewu/Sparse4D/releases/download/v0.0/sparse4dv1_r101_H4_release.pth)|[log](https://github.com/linxuewu/Sparse4D/releases/download/v0.0/sparse4d.log)|
|Sparse4Dv2|Res50|[ImageNet]()|256x704| 100 |15H | 20.3 |0.5384|0.4392|-|-|-|[cfg](https://github.com/linxuewu/Sparse4D/blob/v2.0/projects/configs/sparse4dv2_r50_HInf_256x704.py)|[ckpt](https://github.com/linxuewu/Sparse4D/releases/download/v0.0/sparse4dv2_r50_HInf_256x704.pth)|[log](https://github.com/linxuewu/Sparse4D/releases/download/v0.0/sparse4dv2_r50_HInf_256x704.log.json)|
|Sparse4Dv2|Res101|[nuImage](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r101_fpn_1x_nuim/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804-45215b1e.pth)|512x1408| 100 |2Day | 8.4 |0.5939|0.5051|-|-|-|-|-|-|
|Sparse4Dv3|Res50|[ImageNet]()|256x704| 100 |22H | 19.8 |0.5637|0.4646|0.477|1.167|456|[cfg](https://github.com/HorizonRobotics/Sparse4D/blob/main/projects/configs/sparse4dv3_temporal_r50_1x8_bs6_256x704.py)|[ckpt](https://github.com/HorizonRobotics/Sparse4D/releases/download/v3.0/sparse4dv3_r50.pth)|[log](https://github.com/HorizonRobotics/Sparse4D/releases/download/v3.0/sparse4dv3_r50.log)
|Sparse4Dv3|Res101|[nuImage](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r101_fpn_1x_nuim/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804-45215b1e.pth)|512x1408| 100 |2Day | 8.2 |0.623|0.537|0.567|1.027|557|-|-|-|
### Results on Test Split
|model| backbone | img size | NDS | mAP |mATE| mASE | mAOE |mAVE| mAAE | AMOTA |AMOTP |IDS|
| :---: | :---: | :---: | :---: | :---:|:---:|:---: | :---: | :----: | :----: | :----: | :----: | :----: |
|Sparse4D-T4|[VoV-99](https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/det/eva02_L_coco_det_sys_o365.pth)|640x1600|0.595|0.511|0.533|0.263|0.369|0.317|0.124|-|-|-|
|Sparse4Dv2|[VoV-99](https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/det/eva02_L_coco_det_sys_o365.pth)|640x1600|0.638|0.556|0.462|0.238|0.328|0.264|0.115|-|-|-|
|Sparse4Dv3|[VoV-99](https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/det/eva02_L_coco_det_sys_o365.pth)|640x1600|0.656|0.570|0.412|0.236|0.312|0.210|0.117|0.574|0.970|669|
|Sparse4Dv3-offline|[EVA02-large](https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/det/eva02_L_coco_det_sys_o365.pth)|640x1600|0.719|0.668|0.346|0.234|0.279|0.142|0.145|0.677|0.761|514|
PS: On the nuScenes leaderboard, Sparse4Dv3 is marked with external data=True because the EVA02-large pretraining utilized ImageNet, Objects365, and COCO, along with supervision from CLIP. Therefore, we consider a model pre-trained with EVA02 as incorporating external data. **However, we did not use external 3D detection data for training.** This clarification is provided to facilitate fair comparisons.
## Quick Start
[Quick Start](docs/quick_start.md)
## Citation
```
@misc{2311.11722,
Author = {Xuewu Lin and Zixiang Pei and Tianwei Lin and Lichao Huang and Zhizhong Su},
Title = {Sparse4D v3: Advancing End-to-End 3D Detection and Tracking},
Year = {2023},
Eprint = {arXiv:2311.11722},
}
@misc{2305.14018,
Author = {Xuewu Lin and Tianwei Lin and Zixiang Pei and Lichao Huang and Zhizhong Su},
Title = {Sparse4D v2: Recurrent Temporal Fusion with Sparse Model},
Year = {2023},
Eprint = {arXiv:2305.14018},
}
@misc{2211.10581,
Author = {Xuewu Lin and Tianwei Lin and Zixiang Pei and Lichao Huang and Zhizhong Su},
Title = {Sparse4D: Multi-view 3D Object Detection with Sparse Spatial-Temporal Fusion},
Year = {2022},
Eprint = {arXiv:2211.10581},
}
```
## Acknowledgement
- [BEVFormer](https://github.com/fundamentalvision/BEVFormer)
- [DETR3D](https://github.com/WangYueFt/detr3d)
- [mmdet3d](https://github.com/open-mmlab/mmdetection3d)
- [SOLOFusion](https://github.com/Divadi/SOLOFusion/tree/main/configs/solofusion)
- [StreamPETR](https://github.com/exiawsh/StreamPETR)
torch==1.13.0+cu116
torchvision==0.14.0+cu116
torchaudio==0.13.0+cu116
numpy==1.23.5
mmcv_full==1.7.1
mmdet==2.28.2
urllib3==1.26.16
pyquaternion==0.9.9
nuscenes-devkit==1.1.10
yapf==0.33.0
tensorboard==2.14.0
motmetrics==1.1.3
pandas==1.1.5
import numpy as np
from sklearn.cluster import KMeans
import mmcv
from projects.mmdet3d_plugin.core.box3d import *
def get_kmeans_anchor(
ann_file,
num_anchor=900,
detection_range=55,
output_file_name="nuscenes_kmeans900.npy",
verbose=False,
):
data = mmcv.load(ann_file, file_format="pkl")
gt_boxes = np.concatenate([x["gt_boxes"] for x in data["infos"]], axis=0)
distance = np.linalg.norm(gt_boxes[:, :3], axis=-1, ord=2)
mask = distance <= detection_range
gt_boxes = gt_boxes[mask]
clf = KMeans(n_clusters=num_anchor, verbose=verbose)
print("===========Starting kmeans, please wait.===========")
clf.fit(gt_boxes[:, [X, Y, Z]])
anchor = np.zeros((num_anchor, 11))
anchor[:, [X, Y, Z]] = clf.cluster_centers_
anchor[:, [W, L, H]] = np.log(gt_boxes[:, [W, L, H]].mean(axis=0))
anchor[:, COS_YAW] = 1
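# Box sizes are stored in log space; yaw is encoded as (sin_yaw, cos_yaw), so
# cos_yaw = 1 with sin_yaw = 0 corresponds to yaw = 0. Any remaining dims of the
# 11-dim anchor (e.g. velocities, if present in the layout) are left at zero.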
np.save(output_file_name, anchor)
print(f"===========Done! Save results to {output_file_name}.===========")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="anchor kmeans")
parser.add_argument("--ann_file", type=str, required=True)
parser.add_argument("--num_anchor", type=int, default=900)
parser.add_argument("--detection_range", type=float, default=55)
parser.add_argument(
"--output_file_name", type=str, default="_nuscenes_kmeans900.npy"
)
parser.add_argument("--verbose", action="store_true")
args = parser.parse_args()
get_kmeans_anchor(
args.ann_file,
args.num_anchor,
args.detection_range,
args.output_file_name,
args.verbose,
)
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import time
import torch
from mmcv import Config
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint, wrap_fp16_model
import sys
sys.path.append(".")
from projects.mmdet3d_plugin.datasets.builder import build_dataloader
from projects.mmdet3d_plugin.datasets import custom_build_dataset
# from mmdet3d.datasets import build_dataloader, build_dataset
from mmdet.models import build_detector
from tools.fuse_conv_bn import fuse_module
def parse_args():
parser = argparse.ArgumentParser(description="MMDet benchmark a model")
parser.add_argument("config", help="test config file path")
parser.add_argument("--checkpoint", default=None, help="checkpoint file")
parser.add_argument("--samples", default=2000, help="samples to benchmark")
parser.add_argument(
"--log-interval", default=50, help="interval of logging"
)
parser.add_argument(
"--fuse-conv-bn",
action="store_true",
help="Whether to fuse conv and bn, this will slightly increase"
"the inference speed",
)
args = parser.parse_args()
return args
def get_max_memory(model):
device = getattr(model, "output_device", None)
mem = torch.cuda.max_memory_allocated(device=device)
mem_mb = torch.tensor(
[mem / (1024 * 1024)], dtype=torch.int, device=device
)
return mem_mb.item()
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
# set cudnn_benchmark
if cfg.get("cudnn_benchmark", False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
cfg.data.test.test_mode = True
# build the dataloader
# TODO: support multiple images per gpu (only minor changes are needed)
print(cfg.data.test)
dataset = custom_build_dataset(cfg.data.test)
data_loader = build_dataloader(
dataset,
samples_per_gpu=1,
workers_per_gpu=cfg.data.workers_per_gpu,
dist=False,
shuffle=False,
)
# build the model and load checkpoint
cfg.model.train_cfg = None
model = build_detector(cfg.model, test_cfg=cfg.get("test_cfg"))
fp16_cfg = cfg.get("fp16", None)
if fp16_cfg is not None:
wrap_fp16_model(model)
if args.checkpoint is not None:
load_checkpoint(model, args.checkpoint, map_location="cpu")
if args.fuse_conv_bn:
model = fuse_module(model)
model = MMDataParallel(model, device_ids=[0])
model.eval()
# the first several iterations may be very slow so skip them
num_warmup = 5
pure_inf_time = 0
# benchmark with several samples and take the average
max_memory = 0
for i, data in enumerate(data_loader):
# torch.cuda.synchronize()
with torch.no_grad():
start_time = time.perf_counter()
model(return_loss=False, rescale=True, **data)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start_time
max_memory = max(max_memory, get_max_memory(model))
if i >= num_warmup:
pure_inf_time += elapsed
if (i + 1) % args.log_interval == 0:
fps = (i + 1 - num_warmup) / pure_inf_time
print(
f"Done image [{i + 1:<3}/ {args.samples}], "
f"fps: {fps:.1f} img / s, "
f"gpu mem: {max_memory} M"
)
if (i + 1) == args.samples:
pure_inf_time += elapsed
fps = (i + 1 - num_warmup) / pure_inf_time
print(f"Overall fps: {fps:.1f} img / s")
break
if __name__ == "__main__":
main()
#!/usr/bin/env bash
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
PORT=${PORT:-29610}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python3 -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4}
#!/usr/bin/env bash
CONFIG=$1
GPUS=$2
PORT=${PORT:-28651}
# export PYTORCH_MIOPEN_SUGGEST_NHWC=1 #.to(memory_format=torch.channels_last)
# export MIOPEN_FIND_MODE=1
export TORCHINDUCTOR_LAYOUT_OPTIMIZATION=1
export PYTORCH_MIOPEN_SUGGEST_NHWC=1
export TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1
export LD_LIBRARY_PATH=/opt/rocblas-install/lib:$LD_LIBRARY_PATH
export NCCL_ALGO=Ring
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_TOPO_FILE=null
export NCCL_RINGS="N0 0 7 6 5 4 3 2 1 N0|N1 1 2 3 4 5 6 7 0 N1|N2 2 1 0 7 6 5 4 3 N2|N3 3 4 5 6 7 0 1 2 N3|N4 4 3 2 1 0 7 6 5 N4|N5 5 6 7 0 1 2 3 4 N5|N6 6 5 4 3 2 1 0 7 N6|N7 7 0 1 2 3 4 5 6 N7"
export HIP_VISIBLE_DEVICES=1
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
torchrun --nproc_per_node=$GPUS --master_port=$PORT \
$(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
#!/usr/bin/env bash
CONFIG=$1
GPUS=$2
PORT=${PORT:-28650}
export NCCL_ALGO=Ring
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_TOPO_FILE=null
export NCCL_RINGS="N0 0 7 6 5 4 3 2 1 N0|N1 1 2 3 4 5 6 7 0 N1|N2 2 1 0 7 6 5 4 3 N2|N3 3 4 5 6 7 0 1 2 N3|N4 4 3 2 1 0 7 6 5 N4|N5 5 6 7 0 1 2 3 4 N5|N6 6 5 4 3 2 1 0 7 N6|N7 7 0 1 2 3 4 5 6 N7"
# PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
# torchrun --nproc_per_node=$GPUS --master_port=$PORT \
# $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
torchrun --nnodes 2 --nproc_per_node 8 --node_rank 0 --master_addr=10.16.6.16 --master_port 4519 \
$(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
# Set the master node address (can be omitted for single-machine training)
# export MASTER_ADDR="30.149.248.7"
# export GPU_FLUSH_ON_EXECUTION=1
export PYTORCH_MIOPEN_SUGGEST_NHWC=1 #.to(memory_format=torch.channels_last)
export MIOPEN_FIND_MODE=1
export NCCL_ALGO=Ring
export HSA_FORCE_FINE_GRAIN_PCIE=1
# export LD_LIBRARY_PATH=/opt/rocblas-install/lib:$LD_LIBRARY_PATH
export NCCL_TOPO_FILE=null
# export NCCL_DEBUG=INFO
export NCCL_RINGS="N0 0 7 6 5 4 3 2 1 N0|N1 1 2 3 4 5 6 7 0 N1|N2 2 1 0 7 6 5 4 3 N2|N3 3 4 5 6 7 0 1 2 N3|N4 4 3 2 1 0 7 6 5 N4|N5 5 6 7 0 1 2 3 4 N5|N6 6 5 4 3 2 1 0 7 N6|N7 7 0 1 2 3 4 5 6 N7"
# PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
# torchrun --nnodes 2 --nproc_per_node 8 --node_rank 0 --master_addr=10.16.6.16 --master_port 9528 \
# $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
# --enable-profiler \
#--to_channels_last \
#!/usr/bin/env bash
CONFIG=$1
GPUS=$2
PORT=${PORT:-28650}
export NCCL_ALGO=Ring
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_TOPO_FILE=null
export NCCL_RINGS="N0 0 7 6 5 4 3 2 1 N0|N1 1 2 3 4 5 6 7 0 N1|N2 2 1 0 7 6 5 4 3 N2|N3 3 4 5 6 7 0 1 2 N3|N4 4 3 2 1 0 7 6 5 N4|N5 5 6 7 0 1 2 3 4 N5|N6 6 5 4 3 2 1 0 7 N6|N7 7 0 1 2 3 4 5 6 N7"
# PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
# torchrun --nproc_per_node=$GPUS --master_port=$PORT \
# $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
torchrun --nnodes 2 --nproc_per_node 8 --node_rank 1 --master_addr=10.16.6.16 --master_port 4519 \
$(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import torch
from torch import nn as nn
from mmcv.runner import save_checkpoint
from mmdet.apis import init_detector
def fuse_conv_bn(conv, bn):
"""During inference, the functionary of batch norm layers is turned off but
only the mean and var alone channels are used, which exposes the chance to
fuse it with the preceding conv layers to save computations and simplify
network structures."""
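# BN folding: y = gamma * (conv(x) - mean) / sqrt(var + eps) + beta
# is equivalent to a single conv with, per output channel,
#   w' = w * gamma / sqrt(var + eps)
#   b' = (b - mean) * gamma / sqrt(var + eps) + beta
# which is what the code below computes via `factor`.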
conv_w = conv.weight
conv_b = (
conv.bias
if conv.bias is not None
else torch.zeros_like(bn.running_mean)
)
factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
conv.weight = nn.Parameter(
conv_w * factor.reshape([conv.out_channels, 1, 1, 1])
)
conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias)
return conv
def fuse_module(m):
last_conv = None
last_conv_name = None
for name, child in m.named_children():
if isinstance(child, (nn.BatchNorm2d, nn.SyncBatchNorm)):
if last_conv is None: # only fuse BN that is after Conv
continue
fused_conv = fuse_conv_bn(last_conv, child)
m._modules[last_conv_name] = fused_conv
# To reduce changes, set BN as Identity instead of deleting it.
m._modules[name] = nn.Identity()
last_conv = None
elif isinstance(child, nn.Conv2d):
last_conv = child
last_conv_name = name
else:
fuse_module(child)
return m
def parse_args():
parser = argparse.ArgumentParser(
description="fuse Conv and BN layers in a model"
)
parser.add_argument("config", help="config file path")
parser.add_argument("checkpoint", help="checkpoint file path")
parser.add_argument("out", help="output path of the converted model")
args = parser.parse_args()
return args
def main():
args = parse_args()
# build the model from a config file and a checkpoint file
model = init_detector(args.config, args.checkpoint)
# fuse conv and bn layers of the model
fused_model = fuse_module(model)
save_checkpoint(fused_model, args.out)
if __name__ == "__main__":
main()
# Copyright (c) OpenMMLab. All rights reserved.
import os
from collections import OrderedDict
from os import path as osp
from typing import List, Tuple, Union
import numpy as np
from nuscenes.nuscenes import NuScenes
import mmcv
from pyquaternion import Quaternion
NameMapping = {
"movable_object.barrier": "barrier",
"vehicle.bicycle": "bicycle",
"vehicle.bus.bendy": "bus",
"vehicle.bus.rigid": "bus",
"vehicle.car": "car",
"vehicle.construction": "construction_vehicle",
"vehicle.motorcycle": "motorcycle",
"human.pedestrian.adult": "pedestrian",
"human.pedestrian.child": "pedestrian",
"human.pedestrian.construction_worker": "pedestrian",
"human.pedestrian.police_officer": "pedestrian",
"movable_object.trafficcone": "traffic_cone",
"vehicle.trailer": "trailer",
"vehicle.truck": "truck",
}
def create_nuscenes_infos(
root_path, info_prefix, version="v1.0-trainval", max_sweeps=10
):
"""Create info file of nuscene dataset.
Given the raw data, generate its related info file in pkl format.
Args:
root_path (str): Path of the data root.
info_prefix (str): Prefix of the info file to be generated.
version (str, optional): Version of the data.
Default: 'v1.0-trainval'.
max_sweeps (int, optional): Max number of sweeps.
Default: 10.
"""
from nuscenes.nuscenes import NuScenes
nusc = NuScenes(version=version, dataroot=root_path, verbose=True)
from nuscenes.utils import splits
available_vers = ["v1.0-trainval", "v1.0-test", "v1.0-mini"]
assert version in available_vers
if version == "v1.0-trainval":
train_scenes = splits.train
val_scenes = splits.val
elif version == "v1.0-test":
train_scenes = splits.test
val_scenes = []
elif version == "v1.0-mini":
train_scenes = splits.mini_train
val_scenes = splits.mini_val
else:
raise ValueError("unknown")
# filter existing scenes.
available_scenes = get_available_scenes(nusc)
available_scene_names = [s["name"] for s in available_scenes]
train_scenes = list(
filter(lambda x: x in available_scene_names, train_scenes)
)
val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes))
train_scenes = set(
[
available_scenes[available_scene_names.index(s)]["token"]
for s in train_scenes
]
)
val_scenes = set(
[
available_scenes[available_scene_names.index(s)]["token"]
for s in val_scenes
]
)
test = "test" in version
if test:
print("test scene: {}".format(len(train_scenes)))
else:
print(
"train scene: {}, val scene: {}".format(
len(train_scenes), len(val_scenes)
)
)
train_nusc_infos, val_nusc_infos = _fill_trainval_infos(
nusc, train_scenes, val_scenes, test, max_sweeps=max_sweeps
)
metadata = dict(version=version)
if test:
print("test sample: {}".format(len(train_nusc_infos)))
data = dict(infos=train_nusc_infos, metadata=metadata)
info_path = "{}_infos_test.pkl".format(info_prefix)
mmcv.dump(data, info_path)
else:
print(
"train sample: {}, val sample: {}".format(
len(train_nusc_infos), len(val_nusc_infos)
)
)
data = dict(infos=train_nusc_infos, metadata=metadata)
info_path = "{}_infos_train.pkl".format(info_prefix)
mmcv.dump(data, info_path)
data["infos"] = val_nusc_infos
info_val_path = "{}_infos_val.pkl".format(info_prefix)
mmcv.dump(data, info_val_path)
def get_available_scenes(nusc):
"""Get available scenes from the input nuscenes class.
Given the raw data, get the information of available scenes for
further info generation.
Args:
nusc (class): Dataset class in the nuScenes dataset.
Returns:
available_scenes (list[dict]): List of basic information for the
available scenes.
"""
available_scenes = []
print("total scene num: {}".format(len(nusc.scene)))
for scene in nusc.scene:
scene_token = scene["token"]
scene_rec = nusc.get("scene", scene_token)
sample_rec = nusc.get("sample", scene_rec["first_sample_token"])
sd_rec = nusc.get("sample_data", sample_rec["data"]["LIDAR_TOP"])
has_more_frames = True
scene_not_exist = False
while has_more_frames:
lidar_path, boxes, _ = nusc.get_sample_data(sd_rec["token"])
lidar_path = str(lidar_path)
if os.getcwd() in lidar_path:
# path from lyftdataset is absolute path
lidar_path = lidar_path.split(f"{os.getcwd()}/")[-1]
# relative path
if not mmcv.is_filepath(lidar_path):
scene_not_exist = True
break
else:
break
if scene_not_exist:
continue
available_scenes.append(scene)
print("exist scene num: {}".format(len(available_scenes)))
return available_scenes
def _fill_trainval_infos(
nusc, train_scenes, val_scenes, test=False, max_sweeps=10
):
"""Generate the train/val infos from the raw data.
Args:
nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset.
train_scenes (list[str]): Basic information of training scenes.
val_scenes (list[str]): Basic information of validation scenes.
test (bool, optional): Whether use the test mode. In test mode, no
annotations can be accessed. Default: False.
max_sweeps (int, optional): Max number of sweeps. Default: 10.
Returns:
tuple[list[dict]]: Information of training set and validation set
that will be saved to the info file.
"""
train_nusc_infos = []
val_nusc_infos = []
for sample in mmcv.track_iter_progress(nusc.sample):
lidar_token = sample["data"]["LIDAR_TOP"]
sd_rec = nusc.get("sample_data", sample["data"]["LIDAR_TOP"])
cs_record = nusc.get(
"calibrated_sensor", sd_rec["calibrated_sensor_token"]
)
pose_record = nusc.get("ego_pose", sd_rec["ego_pose_token"])
lidar_path, boxes, _ = nusc.get_sample_data(lidar_token)
mmcv.check_file_exist(lidar_path)
info = {
"lidar_path": lidar_path,
"token": sample["token"],
"sweeps": [],
"cams": dict(),
"lidar2ego_translation": cs_record["translation"],
"lidar2ego_rotation": cs_record["rotation"],
"ego2global_translation": pose_record["translation"],
"ego2global_rotation": pose_record["rotation"],
"timestamp": sample["timestamp"],
}
l2e_r = info["lidar2ego_rotation"]
l2e_t = info["lidar2ego_translation"]
e2g_r = info["ego2global_rotation"]
e2g_t = info["ego2global_translation"]
l2e_r_mat = Quaternion(l2e_r).rotation_matrix
e2g_r_mat = Quaternion(e2g_r).rotation_matrix
# obtain information of the 6 camera images for each frame
camera_types = [
"CAM_FRONT",
"CAM_FRONT_RIGHT",
"CAM_FRONT_LEFT",
"CAM_BACK",
"CAM_BACK_LEFT",
"CAM_BACK_RIGHT",
]
for cam in camera_types:
cam_token = sample["data"][cam]
cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token)
cam_info = obtain_sensor2top(
nusc, cam_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, cam
)
cam_info.update(cam_intrinsic=cam_intrinsic)
info["cams"].update({cam: cam_info})
# obtain sweeps for a single key-frame
sd_rec = nusc.get("sample_data", sample["data"]["LIDAR_TOP"])
sweeps = []
while len(sweeps) < max_sweeps:
if not sd_rec["prev"] == "":
sweep = obtain_sensor2top(
nusc,
sd_rec["prev"],
l2e_t,
l2e_r_mat,
e2g_t,
e2g_r_mat,
"lidar",
)
sweeps.append(sweep)
sd_rec = nusc.get("sample_data", sd_rec["prev"])
else:
break
info["sweeps"] = sweeps
# obtain annotation
if not test:
annotations = [
nusc.get("sample_annotation", token)
for token in sample["anns"]
]
locs = np.array([b.center for b in boxes]).reshape(-1, 3)
dims = np.array([b.wlh for b in boxes]).reshape(-1, 3)
rots = np.array(
[b.orientation.yaw_pitch_roll[0] for b in boxes]
).reshape(-1, 1)
velocity = np.array(
[nusc.box_velocity(token)[:2] for token in sample["anns"]]
)
valid_flag = np.array(
[
(anno["num_lidar_pts"] + anno["num_radar_pts"]) > 0
for anno in annotations
],
dtype=bool,
).reshape(-1)
# convert velo from global to lidar
for i in range(len(boxes)):
velo = np.array([*velocity[i], 0.0])
velo = (
velo
@ np.linalg.inv(e2g_r_mat).T
@ np.linalg.inv(l2e_r_mat).T
)
velocity[i] = velo[:2]
names = [b.name for b in boxes]
for i in range(len(names)):
if names[i] in NameMapping:
names[i] = NameMapping[names[i]]
names = np.array(names)
# we need to convert box size to
# the format of our lidar coordinate system
# which is x_size, y_size, z_size (corresponding to l, w, h)
gt_boxes = np.concatenate([locs, dims[:, [1, 0, 2]], rots], axis=1)
assert len(gt_boxes) == len(
annotations
), f"{len(gt_boxes)}, {len(annotations)}"
info["instance_inds"] = np.array(
[
nusc.getind("instance", x["instance_token"])
for x in annotations
]
)
info["gt_boxes"] = gt_boxes
info["gt_names"] = names
info["gt_velocity"] = velocity.reshape(-1, 2)
info["num_lidar_pts"] = np.array(
[a["num_lidar_pts"] for a in annotations]
)
info["num_radar_pts"] = np.array(
[a["num_radar_pts"] for a in annotations]
)
info["valid_flag"] = valid_flag
if sample["scene_token"] in train_scenes:
train_nusc_infos.append(info)
else:
val_nusc_infos.append(info)
return train_nusc_infos, val_nusc_infos
def obtain_sensor2top(
nusc, sensor_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, sensor_type="lidar"
):
"""Obtain the info with RT matric from general sensor to Top LiDAR.
Args:
nusc (class): Dataset class in the nuScenes dataset.
sensor_token (str): Sample data token corresponding to the
specific sensor type.
l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3).
l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego
in shape (3, 3).
e2g_t (np.ndarray): Translation from ego to global in shape (1, 3).
e2g_r_mat (np.ndarray): Rotation matrix from ego to global
in shape (3, 3).
sensor_type (str, optional): Sensor to calibrate. Default: 'lidar'.
Returns:
sweep (dict): Sweep information after transformation.
"""
sd_rec = nusc.get("sample_data", sensor_token)
cs_record = nusc.get(
"calibrated_sensor", sd_rec["calibrated_sensor_token"]
)
pose_record = nusc.get("ego_pose", sd_rec["ego_pose_token"])
data_path = str(nusc.get_sample_data_path(sd_rec["token"]))
if os.getcwd() in data_path: # path from lyftdataset is absolute path
data_path = data_path.split(f"{os.getcwd()}/")[-1] # relative path
sweep = {
"data_path": data_path,
"type": sensor_type,
"sample_data_token": sd_rec["token"],
"sensor2ego_translation": cs_record["translation"],
"sensor2ego_rotation": cs_record["rotation"],
"ego2global_translation": pose_record["translation"],
"ego2global_rotation": pose_record["rotation"],
"timestamp": sd_rec["timestamp"],
}
l2e_r_s = sweep["sensor2ego_rotation"]
l2e_t_s = sweep["sensor2ego_translation"]
e2g_r_s = sweep["ego2global_rotation"]
e2g_t_s = sweep["ego2global_translation"]
# obtain the RT from sensor to Top LiDAR
# sweep->ego->global->ego'->lidar
l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix
e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix
R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ (
np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T
)
T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ (
np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T
)
T -= (
e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T)
+ l2e_t @ np.linalg.inv(l2e_r_mat).T
)
sweep["sensor2lidar_rotation"] = R.T # points @ R.T + T
sweep["sensor2lidar_translation"] = T
return sweep
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="nuscenes converter")
parser.add_argument("--root_path", type=str, default="./data/nuscenes")
parser.add_argument("--info_prefix", type=str, default="nuscenes")
parser.add_argument("--version", type=str, default="v1.0-trainval,v1.0-test")
parser.add_argument("--max_sweeps", type=int, default=10)
args = parser.parse_args()
versions = args.version.split(",")
for version in versions:
create_nuscenes_infos(
args.root_path,
args.info_prefix,
version=version,
max_sweeps=args.max_sweeps,
)
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import mmcv
import os
import torch
import warnings
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import (
get_dist_info,
init_dist,
load_checkpoint,
wrap_fp16_model,
)
from mmdet.apis import single_gpu_test, multi_gpu_test, set_random_seed
from mmdet.datasets import replace_ImageToTensor, build_dataset
from mmdet.datasets import build_dataloader as build_dataloader_origin
from mmdet.models import build_detector
from projects.mmdet3d_plugin.datasets.builder import build_dataloader
from projects.mmdet3d_plugin.apis.test import custom_multi_gpu_test
def parse_args():
parser = argparse.ArgumentParser(
description="MMDet test (and eval) a model"
)
parser.add_argument("config", help="test config file path")
parser.add_argument("checkpoint", help="checkpoint file")
parser.add_argument("--out", help="output result file in pickle format")
parser.add_argument(
"--fuse-conv-bn",
action="store_true",
help="Whether to fuse conv and bn, this will slightly increase"
"the inference speed",
)
parser.add_argument(
"--format-only",
action="store_true",
help="Format the output results without perform evaluation. It is"
"useful when you want to format the result to a specific format and "
"submit it to the test server",
)
parser.add_argument(
"--eval",
type=str,
nargs="+",
help='evaluation metrics, which depends on the dataset, e.g., "bbox",'
' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC',
)
parser.add_argument("--show", action="store_true", help="show results")
parser.add_argument(
"--show-dir", help="directory where results will be saved"
)
parser.add_argument(
"--gpu-collect",
action="store_true",
help="whether to use gpu to collect results.",
)
parser.add_argument(
"--tmpdir",
help="tmp directory used for collecting results from multiple "
"workers, available when gpu-collect is not specified",
)
parser.add_argument("--seed", type=int, default=0, help="random seed")
parser.add_argument(
"--deterministic",
action="store_true",
help="whether to set deterministic options for CUDNN backend.",
)
parser.add_argument(
"--cfg-options",
nargs="+",
action=DictAction,
help="override some settings in the used config, the key-value pair "
"in xxx=yyy format will be merged into config file. If the value to "
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
"Note that the quotation marks are necessary and that no white space "
"is allowed.",
)
parser.add_argument(
"--options",
nargs="+",
action=DictAction,
help="custom options for evaluation, the key-value pair in xxx=yyy "
"format will be kwargs for dataset.evaluate() function (deprecate), "
"change to --eval-options instead.",
)
parser.add_argument(
"--eval-options",
nargs="+",
action=DictAction,
help="custom options for evaluation, the key-value pair in xxx=yyy "
"format will be kwargs for dataset.evaluate() function",
)
parser.add_argument(
"--launcher",
choices=["none", "pytorch", "slurm", "mpi"],
default="none",
help="job launcher",
)
parser.add_argument("--local_rank", type=int, default=0)
parser.add_argument("--result_file", type=str, default=None)
parser.add_argument("--show_only", action="store_true")
args = parser.parse_args()
if "LOCAL_RANK" not in os.environ:
os.environ["LOCAL_RANK"] = str(args.local_rank)
if args.options and args.eval_options:
raise ValueError(
"--options and --eval-options cannot be both specified, "
"--options is deprecated in favor of --eval-options"
)
if args.options:
warnings.warn("--options is deprecated in favor of --eval-options")
args.eval_options = args.options
return args
def main():
args = parse_args()
assert (
args.out or args.eval or args.format_only or args.show or args.show_dir
), (
"Please specify at least one operation (save/eval/format/show the "
'results / save the results) with the argument "--out", "--eval"'
', "--format-only", "--show" or "--show-dir"'
)
if args.eval and args.format_only:
raise ValueError("--eval and --format_only cannot be both specified")
if args.out is not None and not args.out.endswith((".pkl", ".pickle")):
raise ValueError("The output file must be a pkl file.")
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# import modules from string list.
if cfg.get("custom_imports", None):
from mmcv.utils import import_modules_from_strings
import_modules_from_strings(**cfg["custom_imports"])
# import modules from plugin/xx, registry will be updated
if hasattr(cfg, "plugin"):
if cfg.plugin:
import importlib
if hasattr(cfg, "plugin_dir"):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split("/")
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + "." + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split("/")
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + "." + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
# set cudnn_benchmark
if cfg.get("cudnn_benchmark", False):
torch.backends.cudnn.benchmark = True
cfg.model.pretrained = None
# in case the test dataset is concatenated
samples_per_gpu = 1
if isinstance(cfg.data.test, dict):
cfg.data.test.test_mode = True
samples_per_gpu = cfg.data.test.pop("samples_per_gpu", 1)
if samples_per_gpu > 1:
# Replace 'ImageToTensor' to 'DefaultFormatBundle'
cfg.data.test.pipeline = replace_ImageToTensor(
cfg.data.test.pipeline
)
elif isinstance(cfg.data.test, list):
for ds_cfg in cfg.data.test:
ds_cfg.test_mode = True
samples_per_gpu = max(
[ds_cfg.pop("samples_per_gpu", 1) for ds_cfg in cfg.data.test]
)
if samples_per_gpu > 1:
for ds_cfg in cfg.data.test:
ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline)
# init distributed env first, since logger depends on the dist info.
if args.launcher == "none":
distributed = False
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
# set random seeds
if args.seed is not None:
set_random_seed(args.seed, deterministic=args.deterministic)
# build the dataloader
dataset = build_dataset(cfg.data.test)
print("distributed:", distributed)
if distributed:
data_loader = build_dataloader(
dataset,
samples_per_gpu=samples_per_gpu,
workers_per_gpu=cfg.data.workers_per_gpu,
dist=distributed,
shuffle=False,
nonshuffler_sampler=dict(type="DistributedSampler"),
)
else:
data_loader = build_dataloader_origin(
dataset,
samples_per_gpu=samples_per_gpu,
workers_per_gpu=cfg.data.workers_per_gpu,
dist=distributed,
shuffle=False,
)
# build the model and load checkpoint
cfg.model.train_cfg = None
model = build_detector(cfg.model, test_cfg=cfg.get("test_cfg"))
# model = build_model(cfg.model, test_cfg=cfg.get("test_cfg"))
fp16_cfg = cfg.get("fp16", None)
if fp16_cfg is not None:
wrap_fp16_model(model)
checkpoint = load_checkpoint(model, args.checkpoint, map_location="cpu")
if args.fuse_conv_bn:
model = fuse_conv_bn(model)
# old versions did not save class info in checkpoints, this workaround is
# for backward compatibility
if "CLASSES" in checkpoint.get("meta", {}):
model.CLASSES = checkpoint["meta"]["CLASSES"]
else:
model.CLASSES = dataset.CLASSES
# palette for visualization in segmentation tasks
if "PALETTE" in checkpoint.get("meta", {}):
model.PALETTE = checkpoint["meta"]["PALETTE"]
elif hasattr(dataset, "PALETTE"):
# segmentation dataset has `PALETTE` attribute
model.PALETTE = dataset.PALETTE
if args.result_file is not None:
outputs = torch.load(args.result_file)
elif not distributed:
model = MMDataParallel(model, device_ids=[0])
outputs = single_gpu_test(model, data_loader, args.show, args.show_dir)
else:
model = MMDistributedDataParallel(
model.cuda(),
device_ids=[torch.cuda.current_device()],
broadcast_buffers=False,
)
outputs = custom_multi_gpu_test(
model, data_loader, args.tmpdir, args.gpu_collect
)
rank, _ = get_dist_info()
if rank == 0:
if args.out:
print(f"\nwriting results to {args.out}")
mmcv.dump(outputs, args.out)
kwargs = {} if args.eval_options is None else args.eval_options
if args.show_only:
eval_kwargs = cfg.get("evaluation", {}).copy()
# hard-code way to remove EvalHook args
for key in [
"interval",
"tmpdir",
"start",
"gpu_collect",
"save_best",
"rule",
]:
eval_kwargs.pop(key, None)
eval_kwargs.update(kwargs)
dataset.show(outputs, show=True, **eval_kwargs)
elif args.format_only:
dataset.format_results(outputs, **kwargs)
elif args.eval:
eval_kwargs = cfg.get("evaluation", {}).copy()
# hard-code way to remove EvalHook args
for key in [
"interval",
"tmpdir",
"start",
"gpu_collect",
"save_best",
"rule",
]:
eval_kwargs.pop(key, None)
eval_kwargs.update(dict(metric=args.eval, **kwargs))
print(eval_kwargs)
results_dict = dataset.evaluate(outputs, **eval_kwargs)
print(results_dict)
if __name__ == "__main__":
torch.multiprocessing.set_start_method(
"fork"
) # use fork so that workers_per_gpu can be > 1
main()
# Copyright (c) OpenMMLab. All rights reserved.
from __future__ import division
import sys
import os
print(sys.executable, os.path.abspath(__file__))
# import init_paths # for conda pkgs submitting method
import argparse
import copy
import mmcv
import time
import torch
import warnings
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist
from os import path as osp
from mmdet import __version__ as mmdet_version
from mmdet.apis import train_detector
from mmdet.datasets import build_dataset
from mmdet.models import build_detector
from mmdet.utils import collect_env, get_root_logger
from mmdet.apis import set_random_seed
from torch import distributed as dist
from datetime import timedelta
import cv2
cv2.setNumThreads(8)
def parse_args():
parser = argparse.ArgumentParser(description="Train a detector")
parser.add_argument("config", help="train config file path")
parser.add_argument("--work-dir", help="the dir to save logs and models")
parser.add_argument(
"--resume-from", help="the checkpoint file to resume from"
)
parser.add_argument(
"--no-validate",
action="store_true",
help="whether not to evaluate the checkpoint during training",
)
group_gpus = parser.add_mutually_exclusive_group()
group_gpus.add_argument(
"--gpus",
type=int,
help="number of gpus to use "
"(only applicable to non-distributed training)",
)
group_gpus.add_argument(
"--gpu-ids",
type=int,
nargs="+",
help="ids of gpus to use "
"(only applicable to non-distributed training)",
)
parser.add_argument("--seed", type=int, default=0, help="random seed")
parser.add_argument(
"--deterministic",
action="store_true",
help="whether to set deterministic options for CUDNN backend.",
)
parser.add_argument(
"--options",
nargs="+",
action=DictAction,
help="override some settings in the used config, the key-value pair "
"in xxx=yyy format will be merged into config file (deprecate), "
"change to --cfg-options instead.",
)
parser.add_argument(
"--cfg-options",
nargs="+",
action=DictAction,
help="override some settings in the used config, the key-value pair "
"in xxx=yyy format will be merged into config file. If the value to "
'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
"Note that the quotation marks are necessary and that no white space "
"is allowed.",
)
parser.add_argument(
"--dist-url",
type=str,
default="auto",
help="dist url for init process, such as tcp://localhost:8000",
)
parser.add_argument("--gpus-per-machine", type=int, default=8)
parser.add_argument(
"--launcher",
choices=["none", "pytorch", "slurm", "mpi", "mpi_nccl"],
default="none",
help="job launcher",
)
parser.add_argument("--local_rank", type=int, default=0)
parser.add_argument(
"--autoscale-lr",
action="store_true",
help="automatically scale lr with the number of gpus",
)
args = parser.parse_args()
if "LOCAL_RANK" not in os.environ:
os.environ["LOCAL_RANK"] = str(args.local_rank)
if args.options and args.cfg_options:
raise ValueError(
"--options and --cfg-options cannot be both specified, "
"--options is deprecated in favor of --cfg-options"
)
if args.options:
warnings.warn("--options is deprecated in favor of --cfg-options")
args.cfg_options = args.options
return args
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# import modules from string list.
if cfg.get("custom_imports", None):
from mmcv.utils import import_modules_from_strings
import_modules_from_strings(**cfg["custom_imports"])
# import modules from plugin/xx, registry will be updated
if hasattr(cfg, "plugin"):
if cfg.plugin:
import importlib
if hasattr(cfg, "plugin_dir"):
plugin_dir = cfg.plugin_dir
_module_dir = os.path.dirname(plugin_dir)
_module_dir = _module_dir.split("/")
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + "." + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
else:
# import dir is the dirpath for the config file
_module_dir = os.path.dirname(args.config)
_module_dir = _module_dir.split("/")
_module_path = _module_dir[0]
for m in _module_dir[1:]:
_module_path = _module_path + "." + m
print(_module_path)
plg_lib = importlib.import_module(_module_path)
from projects.mmdet3d_plugin.apis.train import custom_train_model
# set cudnn_benchmark
if cfg.get("cudnn_benchmark", False):
torch.backends.cudnn.benchmark = True
# work_dir is determined in this priority: CLI > segment in file > filename
if args.work_dir is not None:
# update configs according to CLI args if args.work_dir is not None
cfg.work_dir = args.work_dir
elif cfg.get("work_dir", None) is None:
# use config filename as default work_dir if cfg.work_dir is None
cfg.work_dir = osp.join(
"./work_dirs", osp.splitext(osp.basename(args.config))[0]
)
if args.resume_from is not None:
cfg.resume_from = args.resume_from
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids
else:
cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)
if args.autoscale_lr:
# apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
cfg.optimizer["lr"] = cfg.optimizer["lr"] * len(cfg.gpu_ids) / 8
# init distributed env first, since logger depends on the dist info.
print("args.launcher:",args.launcher)
if args.launcher == "none":
distributed = False
print("cfg.gpu_ids_none:", cfg.gpu_ids)
elif args.launcher == "mpi_nccl":
distributed = True
import mpi4py.MPI as MPI
comm = MPI.COMM_WORLD
mpi_local_rank = comm.Get_rank()
mpi_world_size = comm.Get_size()
print(
"MPI local_rank=%d, world_size=%d"
% (mpi_local_rank, mpi_world_size)
)
# num_gpus = torch.cuda.device_count()
device_ids_on_machines = list(range(args.gpus_per_machine))
str_ids = list(map(str, device_ids_on_machines))
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str_ids)
torch.cuda.set_device(mpi_local_rank % args.gpus_per_machine)
dist.init_process_group(
backend="nccl",
init_method=args.dist_url,
world_size=mpi_world_size,
rank=mpi_local_rank,
timeout=timedelta(seconds=3600),
)
cfg.gpu_ids = range(mpi_world_size)
print("cfg.gpu_ids_mpi_nccl:", cfg.gpu_ids)
else:
distributed = True
init_dist(
args.launcher, timeout=timedelta(seconds=3600), **cfg.dist_params
)
# re-set gpu_ids with distributed training mode
_, world_size = get_dist_info()
cfg.gpu_ids = range(world_size)
print("cfg.gpu_ids_others:", cfg.gpu_ids)
# create work_dir
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
# dump config
cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config)))
# init the logger before other steps
timestamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
log_file = osp.join(cfg.work_dir, f"{timestamp}.log")
# specify logger name, if we still use 'mmdet', the output info will be
# filtered and won't be saved in the log_file
# TODO: ugly workaround to judge whether we are training det or seg model
logger = get_root_logger(
log_file=log_file, log_level=cfg.log_level
)
# init the meta dict to record some important information such as
# environment info and seed, which will be logged
meta = dict()
# log env info
env_info_dict = collect_env()
env_info = "\n".join([(f"{k}: {v}") for k, v in env_info_dict.items()])
dash_line = "-" * 60 + "\n"
logger.info(
"Environment info:\n" + dash_line + env_info + "\n" + dash_line
)
meta["env_info"] = env_info
meta["config"] = cfg.pretty_text
# log some basic info
logger.info(f"Distributed training: {distributed}")
logger.info(f"Config:\n{cfg.pretty_text}")
# set random seeds
if args.seed is not None:
logger.info(
f"Set random seed to {args.seed}, "
f"deterministic: {args.deterministic}"
)
set_random_seed(args.seed, deterministic=args.deterministic)
cfg.seed = args.seed
meta["seed"] = args.seed
meta["exp_name"] = osp.basename(args.config)
model = build_detector(
cfg.model, train_cfg=cfg.get("train_cfg"), test_cfg=cfg.get("test_cfg")
)
model.init_weights()
logger.info(f"Model:\n{model}")
datasets = [build_dataset(cfg.data.train)]
if len(cfg.workflow) == 2:
val_dataset = copy.deepcopy(cfg.data.val)
# in case we use a dataset wrapper
if "dataset" in cfg.data.train:
val_dataset.pipeline = cfg.data.train.dataset.pipeline
else:
val_dataset.pipeline = cfg.data.train.pipeline
# set test_mode=False here in deep copied config
# which do not affect AP/AR calculation later
# refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa
val_dataset.test_mode = False
datasets.append(build_dataset(val_dataset))
if cfg.checkpoint_config is not None:
# save mmdet version, config file content and class names in
# checkpoints as meta data
cfg.checkpoint_config.meta = dict(
mmdet_version=mmdet_version,
config=cfg.pretty_text,
CLASSES=datasets[0].CLASSES,
)
# add an attribute for visualization convenience
model.CLASSES = datasets[0].CLASSES
torch.backends.cudnn.benchmark = True # enable cuDNN auto-tuning to find the fastest convolution algorithms
if hasattr(cfg, "plugin"):
custom_train_model(
model,
datasets,
cfg,
distributed=distributed,
validate=(not args.no_validate),
timestamp=timestamp,
meta=meta,
)
else:
train_detector(
model,
datasets,
cfg,
distributed=distributed,
validate=(not args.no_validate),
timestamp=timestamp,
meta=meta,
)
if __name__ == "__main__":
torch.multiprocessing.set_start_method(
"fork"
) # use fork so that workers_per_gpu can be > 1
main()