Commit 7c0c60e3 authored by change3n8

init
Metadata-Version: 2.1
Name: deformable-aggregation-ext
Version: 0.0.0
setup.py
deformable_aggregation_ext.egg-info/PKG-INFO
deformable_aggregation_ext.egg-info/SOURCES.txt
deformable_aggregation_ext.egg-info/dependency_links.txt
deformable_aggregation_ext.egg-info/top_level.txt
src/deformable_aggregation.cpp
src/deformable_aggregation_cuda.cu
src/deformable_aggregation_cuda.hip
src/deformable_aggregation_hip.cpp
# ops/setup.py
import os
import torch
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension


def make_cuda_ext(name, module, sources, sources_cuda=[], extra_args=[]):
    define_macros = []
    extra_compile_args = {"cxx": [] + extra_args}
    if torch.cuda.is_available() or os.getenv("FORCE_CUDA", "0") == "1":
        define_macros += [("WITH_CUDA", None)]
        extension = CUDAExtension
        extra_compile_args["nvcc"] = extra_args + [
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]
        sources += sources_cuda
    else:
        print(f"Compiling {name} without CUDA")
        extension = CppExtension
    return extension(
        name=f"{module}.{name}",
        sources=[os.path.join(module.replace(".", "/"), "src", p) for p in sources],
        include_dirs=[os.path.join(module.replace(".", "/"), "src")],
        define_macros=define_macros,
        extra_compile_args=extra_compile_args,
    )


setup(
    name="deformable_aggregation_ext",
    packages=["deformable_aggregation_ext"],  # the Python package to install
    ext_modules=[
        make_cuda_ext(
            name="deformable_aggregation_ext",
            module="deformable_aggregation_ext",  # extension module lives inside that package
            sources=[
                "deformable_aggregation.cpp",
                "deformable_aggregation_cuda.cu",
            ],
        )
    ],
    cmdclass={"build_ext": BuildExtension},
)
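For reference, a minimal build-and-call sketch (not part of this commit). The tensor shapes are inferred from how the C++ wrapper below reads sizes (batch/num_feat/num_embeds from mc_ms_feat, num_cams/num_scale from spatial_shape, num_anchors/num_pts from sampling_location, num_groups from weights.size(5)); the concrete dimensions are only illustrative.

# Build in place from the ops/ directory:
#   python setup.py build_ext --inplace
import torch
from deformable_aggregation_ext import deformable_aggregation_ext as dae

# Illustrative sizes only; num_groups must divide num_embeds.
bs, num_cams, num_scale, num_anchors, num_pts = 1, 6, 4, 900, 13
num_embeds, num_groups = 256, 8

# Per-(camera, scale) feature-map sizes; num_feat is their total length.
shapes = torch.tensor([[32, 88], [16, 44], [8, 22], [4, 11]], dtype=torch.int32, device="cuda")
spatial_shape = shapes.unsqueeze(0).repeat(num_cams, 1, 1)  # (num_cams, num_scale, 2) int32
sizes = (spatial_shape[..., 0] * spatial_shape[..., 1]).flatten()
scale_start_index = (torch.cumsum(sizes, 0) - sizes).reshape(num_cams, num_scale).int()
num_feat = int(sizes.sum())

feat = torch.rand(bs, num_feat, num_embeds, device="cuda")
sample_loc = torch.rand(bs, num_anchors, num_pts, num_cams, 2, device="cuda")
weights = torch.rand(bs, num_anchors, num_pts, num_cams, num_scale, num_groups, device="cuda")

out = dae.deformable_aggregation_forward(feat, spatial_shape, scale_start_index, sample_loc, weights)
print(out.shape)  # (bs, num_anchors, num_embeds)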
#include <torch/extension.h>
#include <c10/cuda/CUDAGuard.h>
void deformable_aggregation(
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_anchors,
    int num_pts,
    int num_groups
);

void deformable_aggregation_ref(
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_anchors,
    int num_pts,
    int num_groups
);
at::Tensor deformable_aggregation_forward(
    const at::Tensor &_mc_ms_feat,
    const at::Tensor &_spatial_shape,
    const at::Tensor &_scale_start_index,
    const at::Tensor &_sampling_location,
    const at::Tensor &_weights
) {
    at::DeviceGuard guard(_mc_ms_feat.device());
    const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
    int batch_size = _mc_ms_feat.size(0);
    int num_feat = _mc_ms_feat.size(1);
    int num_embeds = _mc_ms_feat.size(2);
    int num_cams = _spatial_shape.size(0);
    int num_scale = _spatial_shape.size(1);
    int num_anchors = _sampling_location.size(1);
    int num_pts = _sampling_location.size(2);
    int num_groups = _weights.size(5);
    const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
    const int* spatial_shape = _spatial_shape.data_ptr<int>();
    const int* scale_start_index = _scale_start_index.data_ptr<int>();
    const float* sampling_location = _sampling_location.data_ptr<float>();
    const float* weights = _weights.data_ptr<float>();
    auto output = at::zeros({batch_size, num_anchors, num_embeds}, _mc_ms_feat.options());
    int warm_up = 10;
    int prof_cnt = 1;
    cudaEvent_t start, stop;
    float milliseconds = 0;
    for (int _i = 0; _i < warm_up; _i++) {
        deformable_aggregation(
            output.data_ptr<float>(),
            mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
            batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
        );
    }
    // The kernel accumulates into `output` with atomic adds, so clear the
    // warm-up contribution before the timed (and returned) run.
    output.zero_();
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    for (int _i = 0; _i < prof_cnt; _i++) {
        deformable_aggregation(
            output.data_ptr<float>(),
            mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
            batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
        );
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Optimized kernel execution time: %.3f ms\n", milliseconds / prof_cnt);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return output;
}
at::Tensor deformable_aggregation_forward_ref(
    const at::Tensor &_mc_ms_feat,
    const at::Tensor &_spatial_shape,
    const at::Tensor &_scale_start_index,
    const at::Tensor &_sampling_location,
    const at::Tensor &_weights
) {
    at::DeviceGuard guard(_mc_ms_feat.device());
    const at::cuda::OptionalCUDAGuard device_guard(device_of(_mc_ms_feat));
    int batch_size = _mc_ms_feat.size(0);
    int num_feat = _mc_ms_feat.size(1);
    int num_embeds = _mc_ms_feat.size(2);
    int num_cams = _spatial_shape.size(0);
    int num_scale = _spatial_shape.size(1);
    int num_anchors = _sampling_location.size(1);
    int num_pts = _sampling_location.size(2);
    int num_groups = _weights.size(5);
    const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
    const int* spatial_shape = _spatial_shape.data_ptr<int>();
    const int* scale_start_index = _scale_start_index.data_ptr<int>();
    const float* sampling_location = _sampling_location.data_ptr<float>();
    const float* weights = _weights.data_ptr<float>();
    auto output = at::zeros({batch_size, num_anchors, num_embeds}, _mc_ms_feat.options());
    int warm_up = 10;
    int prof_cnt = 1;
    cudaEvent_t start, stop;
    float milliseconds = 0;
    for (int _i = 0; _i < warm_up; _i++) {
        deformable_aggregation_ref(
            output.data_ptr<float>(),
            mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
            batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
        );
    }
    // Clear warm-up accumulation before the timed run.
    output.zero_();
    // Create timing events.
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    for (int _i = 0; _i < prof_cnt; _i++) {
        deformable_aggregation_ref(
            output.data_ptr<float>(),
            mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
            batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
        );
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Reference (pre-optimization) kernel execution time: %.3f ms\n", milliseconds / prof_cnt);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def(
        "deformable_aggregation_forward",
        &deformable_aggregation_forward,
        "deformable_aggregation_forward"
    );
    m.def(
        "deformable_aggregation_forward_ref",
        &deformable_aggregation_forward_ref,
        "deformable_aggregation_forward_ref"
    );
}
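Since the module binds both the optimized and the reference paths, a quick parity check is the natural smoke test. Below is a minimal sketch, reusing the illustrative tensors from the build example above; the optimized kernel accumulates in double precision before casting back to float, so exact bitwise equality is not expected and the tolerance is arbitrary.

out_fast = dae.deformable_aggregation_forward(feat, spatial_shape, scale_start_index, sample_loc, weights)
out_ref = dae.deformable_aggregation_forward_ref(feat, spatial_shape, scale_start_index, sample_loc, weights)
torch.cuda.synchronize()
print("max abs diff:", (out_fast - out_ref).abs().max().item())
assert torch.allclose(out_fast, out_ref, atol=1e-4)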
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <THC/THCAtomics.cuh>
#include <iostream>
#include <stdlib.h>
__device__ float bilinear_sampling(
    const float *&bottom_data, const int &height, const int &width,
    const int &num_embeds, const float &h_im, const float &w_im,
    const int &base_ptr
) {
    const int h_low = floorf(h_im);
    const int w_low = floorf(w_im);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;
    const float lh = h_im - h_low;
    const float lw = w_im - w_low;
    const float hh = 1 - lh, hw = 1 - lw;
    const int w_stride = num_embeds;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    float v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
    }
    float v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
    }
    float v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
    }
    float v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
    }
    const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    return val;
}
// Reduction helpers; block_reduce_sum assumes a 64-lane wavefront
// (lane id = threadIdx.x % 64). Neither helper is used by the kernels below.
struct float2_t {
    float a;
    float b;
};

__forceinline__ __device__
float2_t warp_reduce_sum(float2_t val, int max = 32) {
    for (int offset = max; offset > 0; offset >>= 1) {
        val.a += __shfl_down(val.a, offset);
        val.b += __shfl_down(val.b, offset);
    }
    return val;
}

template <int blocksize>
__forceinline__ __device__
float2_t block_reduce_sum(float2_t val, float2_t* shared) {
    const int lid = threadIdx.x % 64;
    const int wid = threadIdx.x / 64;
    constexpr int share_size = blocksize / 64;
    val = warp_reduce_sum(val);
    if constexpr (blocksize == 64) return val;
    if (lid == 0 && wid < share_size) {
        shared[wid] = val;
    }
    __syncthreads();
    if (wid == 0 && lid < share_size) {
        val = shared[lid];
        val = warp_reduce_sum(val, share_size / 2);
    }
    return val;
}
__global__ void deformable_aggregation_kernel_sp(
    const int64_t num_kernels,
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_anchors,
    int num_pts,
    int num_groups
) {
    int64_t block_id = blockIdx.x;  // one block per (batch, anchor)
    int batch_idx = block_id / num_anchors;
    int anchor_local = block_id % num_anchors;
    int anchor_index = batch_idx * num_anchors + anchor_local;
    int channel = threadIdx.x;  // one thread per channel
    if (channel >= num_embeds) return;
    double accum = 0.0;
    // The point/camera/scale dimensions that the reference kernel parallelizes
    // over are traversed serially here and accumulated in a register. The
    // reference kernel issues one atomic add per sample, so this version performs
    // (num_pts * num_cams * num_scale) times fewer atomicAdd operations.
    for (int p = 0; p < num_pts; ++p) {
        for (int cam = 0; cam < num_cams; ++cam) {
            int loc_offset = (((anchor_index * num_pts + p) * num_cams + cam) << 1);
            float loc_w = sample_location[loc_offset + 0];
            float loc_h = sample_location[loc_offset + 1];
            if (!(loc_w > 0.f && loc_w < 1.f && loc_h > 0.f && loc_h < 1.f)) continue;
            for (int s = 0; s < num_scale; ++s) {
                int cam_scale_index = cam * num_scale + s;
                int sp_base = cam_scale_index * 2;
                int H = spatial_shape[sp_base + 0];
                int W = spatial_shape[sp_base + 1];
                float h_im = loc_h * H - 0.5f;
                float w_im = loc_w * W - 0.5f;
                int feat_map_idx = batch_idx * num_feat + scale_start_index[cam_scale_index];
                int base_ptr = feat_map_idx * num_embeds + channel;
                float sampled = bilinear_sampling(mc_ms_feat, H, W, num_embeds, h_im, w_im, base_ptr);
                int embeds_per_group = num_embeds / num_groups;
                int group = embeds_per_group > 0 ? (channel / embeds_per_group) : 0;
                int w_idx = (((((anchor_index * num_pts + p) * num_cams + cam) * num_scale + s) * num_groups) + group);
                double w_val = double(weights[w_idx]);
                accum += (double)sampled * (double)w_val;
            }
        }
    }
    float result = float(accum);
#ifdef __gfx936__
    __builtin_amdgcn_global_atomic_fadd_f32(output + anchor_index * num_embeds + channel, result);
#else
    atomicAdd(output + anchor_index * num_embeds + channel, result);
#endif
}
__global__ void __launch_bounds__(1024, 1)
deformable_aggregation_kernel(
    const int64_t num_kernels,
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_anchors,
    int num_pts,
    int num_groups
) {
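    // The flat thread index enumerates, innermost to outermost:
    //   channel -> scale -> camera -> point -> anchor -> batch, i.e.
    //   idx = ((((batch * num_anchors + anchor) * num_pts + pts) * num_cams + cam) * num_scale + scale) * num_embeds + channel.
    // `weights` uses the same layout with the channel axis collapsed to num_groups,
    // which is why the weight is read at idx / (num_embeds / num_groups).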
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_kernels) return;
    const float weight = *(weights + idx / (num_embeds / num_groups));
    const int channel_index = idx % num_embeds;
    idx /= num_embeds;
    const int scale_index = idx % num_scale;
    idx /= num_scale;
    const int cam_index = idx % num_cams;
    idx /= num_cams;
    const int pts_index = idx % num_pts;
    idx /= num_pts;
    int anchor_index = idx % num_anchors;
    idx /= num_anchors;
    const int batch_index = idx % batch_size;
    idx /= batch_size;
    anchor_index = batch_index * num_anchors + anchor_index;
    const int loc_offset = ((anchor_index * num_pts + pts_index) * num_cams + cam_index) << 1;
    const float loc_w = sample_location[loc_offset];
    if (loc_w <= 0 || loc_w >= 1) return;
    const float loc_h = sample_location[loc_offset + 1];
    if (loc_h <= 0 || loc_h >= 1) return;
    int cam_scale_index = cam_index * num_scale + scale_index;
    const int value_offset = (batch_index * num_feat + scale_start_index[cam_scale_index]) * num_embeds + channel_index;
    cam_scale_index = cam_scale_index << 1;
    const int h = spatial_shape[cam_scale_index];
    const int w = spatial_shape[cam_scale_index + 1];
    const float h_im = loc_h * h - 0.5;
    const float w_im = loc_w * w - 0.5;
    atomicAdd(
        output + anchor_index * num_embeds + channel_index,
        bilinear_sampling(mc_ms_feat, h, w, num_embeds, h_im, w_im, value_offset) * weight
    );
}
void deformable_aggregation(
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_anchors,
    int num_pts,
    int num_groups
) {
    // Grid: one block per (batch, anchor).
    const int grid = batch_size * num_anchors;
    // Block: the smallest power of two in {128, 256, 512, 1024} that covers
    // num_embeds; channels map one-per-thread, so this assumes num_embeds <= 1024.
    int threads = 256;
    if (num_embeds <= 128) threads = 128;
    else if (num_embeds <= 256) threads = 256;
    else if (num_embeds <= 512) threads = 512;
    else threads = 1024;
    deformable_aggregation_kernel_sp<<<grid, threads>>>(
        0, output, mc_ms_feat, spatial_shape, scale_start_index, sample_location, weights,
        batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
    );
}
void deformable_aggregation_ref(
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_anchors,
    int num_pts,
    int num_groups
) {
    const int64_t num_kernels = (int64_t)batch_size * num_pts * num_embeds * num_anchors * num_cams * num_scale;
    deformable_aggregation_kernel
        <<<(int)ceil(((double)num_kernels / 128)), 128>>>(
            num_kernels, output,
            mc_ms_feat, spatial_shape, scale_start_index, sample_location, weights,
            batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
        );
}
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include <ATen/ATen.h>
#include <ATen/hip/HIPContext.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime.h>
#include <THH/THHAtomics.cuh>
#include <iostream>
#include <stdlib.h>
__device__ float bilinear_sampling(
    const float *&bottom_data, const int &height, const int &width,
    const int &num_embeds, const float &h_im, const float &w_im,
    const int &base_ptr
) {
    const int h_low = floorf(h_im);
    const int w_low = floorf(w_im);
    const int h_high = h_low + 1;
    const int w_high = w_low + 1;
    const float lh = h_im - h_low;
    const float lw = w_im - w_low;
    const float hh = 1 - lh, hw = 1 - lw;
    const int w_stride = num_embeds;
    const int h_stride = width * w_stride;
    const int h_low_ptr_offset = h_low * h_stride;
    const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
    const int w_low_ptr_offset = w_low * w_stride;
    const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
    float v1 = 0;
    if (h_low >= 0 && w_low >= 0) {
        const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
        v1 = bottom_data[ptr1];
    }
    float v2 = 0;
    if (h_low >= 0 && w_high <= width - 1) {
        const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
        v2 = bottom_data[ptr2];
    }
    float v3 = 0;
    if (h_high <= height - 1 && w_low >= 0) {
        const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
        v3 = bottom_data[ptr3];
    }
    float v4 = 0;
    if (h_high <= height - 1 && w_high <= width - 1) {
        const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
        v4 = bottom_data[ptr4];
    }
    const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
    return val;
}
// Reduction helpers; block_reduce_sum assumes a 64-lane wavefront
// (lane id = threadIdx.x % 64). Neither helper is used by the kernels below.
struct float2_t {
    float a;
    float b;
};

__forceinline__ __device__
float2_t warp_reduce_sum(float2_t val, int max = 32) {
    for (int offset = max; offset > 0; offset >>= 1) {
        val.a += __shfl_down(val.a, offset);
        val.b += __shfl_down(val.b, offset);
    }
    return val;
}

template <int blocksize>
__forceinline__ __device__
float2_t block_reduce_sum(float2_t val, float2_t* shared) {
    const int lid = threadIdx.x % 64;
    const int wid = threadIdx.x / 64;
    constexpr int share_size = blocksize / 64;
    val = warp_reduce_sum(val);
    if constexpr (blocksize == 64) return val;
    if (lid == 0 && wid < share_size) {
        shared[wid] = val;
    }
    __syncthreads();
    if (wid == 0 && lid < share_size) {
        val = shared[lid];
        val = warp_reduce_sum(val, share_size / 2);
    }
    return val;
}
__global__ void deformable_aggregation_kernel_sp(
    const int64_t num_kernels,
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_anchors,
    int num_pts,
    int num_groups
) {
    int64_t block_id = blockIdx.x;  // one block per (batch, anchor)
    int batch_idx = block_id / num_anchors;
    int anchor_local = block_id % num_anchors;
    int anchor_index = batch_idx * num_anchors + anchor_local;
    int channel = threadIdx.x;  // one thread per channel
    if (channel >= num_embeds) return;
    double accum = 0.0;
    // The point/camera/scale dimensions that the reference kernel parallelizes
    // over are traversed serially here and accumulated in a register. The
    // reference kernel issues one atomic add per sample, so this version performs
    // (num_pts * num_cams * num_scale) times fewer atomicAdd operations.
    for (int p = 0; p < num_pts; ++p) {
        for (int cam = 0; cam < num_cams; ++cam) {
            int loc_offset = (((anchor_index * num_pts + p) * num_cams + cam) << 1);
            float loc_w = sample_location[loc_offset + 0];
            float loc_h = sample_location[loc_offset + 1];
            if (!(loc_w > 0.f && loc_w < 1.f && loc_h > 0.f && loc_h < 1.f)) continue;
            for (int s = 0; s < num_scale; ++s) {
                int cam_scale_index = cam * num_scale + s;
                int sp_base = cam_scale_index * 2;
                int H = spatial_shape[sp_base + 0];
                int W = spatial_shape[sp_base + 1];
                float h_im = loc_h * H - 0.5f;
                float w_im = loc_w * W - 0.5f;
                int feat_map_idx = batch_idx * num_feat + scale_start_index[cam_scale_index];
                int base_ptr = feat_map_idx * num_embeds + channel;
                float sampled = bilinear_sampling(mc_ms_feat, H, W, num_embeds, h_im, w_im, base_ptr);
                int embeds_per_group = num_embeds / num_groups;
                int group = embeds_per_group > 0 ? (channel / embeds_per_group) : 0;
                int w_idx = (((((anchor_index * num_pts + p) * num_cams + cam) * num_scale + s) * num_groups) + group);
                double w_val = double(weights[w_idx]);
                accum += (double)sampled * (double)w_val;
            }
        }
    }
    float result = float(accum);
#ifdef __gfx936__
    __builtin_amdgcn_global_atomic_fadd_f32(output + anchor_index * num_embeds + channel, result);
#else
    atomicAdd(output + anchor_index * num_embeds + channel, result);
#endif
}
__global__ void __launch_bounds__(1024, 1)
deformable_aggregation_kernel(
    const int64_t num_kernels,
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_anchors,
    int num_pts,
    int num_groups
) {
    int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= num_kernels) return;
    const float weight = *(weights + idx / (num_embeds / num_groups));
    const int channel_index = idx % num_embeds;
    idx /= num_embeds;
    const int scale_index = idx % num_scale;
    idx /= num_scale;
    const int cam_index = idx % num_cams;
    idx /= num_cams;
    const int pts_index = idx % num_pts;
    idx /= num_pts;
    int anchor_index = idx % num_anchors;
    idx /= num_anchors;
    const int batch_index = idx % batch_size;
    idx /= batch_size;
    anchor_index = batch_index * num_anchors + anchor_index;
    const int loc_offset = ((anchor_index * num_pts + pts_index) * num_cams + cam_index) << 1;
    const float loc_w = sample_location[loc_offset];
    if (loc_w <= 0 || loc_w >= 1) return;
    const float loc_h = sample_location[loc_offset + 1];
    if (loc_h <= 0 || loc_h >= 1) return;
    int cam_scale_index = cam_index * num_scale + scale_index;
    const int value_offset = (batch_index * num_feat + scale_start_index[cam_scale_index]) * num_embeds + channel_index;
    cam_scale_index = cam_scale_index << 1;
    const int h = spatial_shape[cam_scale_index];
    const int w = spatial_shape[cam_scale_index + 1];
    const float h_im = loc_h * h - 0.5;
    const float w_im = loc_w * w - 0.5;
    atomicAdd(
        output + anchor_index * num_embeds + channel_index,
        bilinear_sampling(mc_ms_feat, h, w, num_embeds, h_im, w_im, value_offset) * weight
    );
}
void deformable_aggregation(
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_anchors,
    int num_pts,
    int num_groups
) {
    // Grid: one block per (batch, anchor).
    const int grid = batch_size * num_anchors;
    // Block: the smallest power of two in {128, 256, 512, 1024} that covers
    // num_embeds; channels map one-per-thread, so this assumes num_embeds <= 1024.
    int threads = 256;
    if (num_embeds <= 128) threads = 128;
    else if (num_embeds <= 256) threads = 256;
    else if (num_embeds <= 512) threads = 512;
    else threads = 1024;
    hipLaunchKernelGGL((deformable_aggregation_kernel_sp), dim3(grid), dim3(threads), 0, 0,
        0, output, mc_ms_feat, spatial_shape, scale_start_index, sample_location, weights,
        batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
    );
}
void deformable_aggregation_ref(
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_anchors,
    int num_pts,
    int num_groups
) {
    const int64_t num_kernels = (int64_t)batch_size * num_pts * num_embeds * num_anchors * num_cams * num_scale;
    hipLaunchKernelGGL((deformable_aggregation_kernel),
        dim3((int)ceil(((double)num_kernels / 128))), dim3(128), 0, 0,
        num_kernels, output,
        mc_ms_feat, spatial_shape, scale_start_index, sample_location, weights,
        batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
    );
}
// !!! This is a file automatically generated by hipify!!!
#include <ATen/dtk_macros.h>
#include <torch/extension.h>
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
void deformable_aggregation(
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_anchors,
    int num_pts,
    int num_groups
);

void deformable_aggregation_ref(
    float* output,
    const float* mc_ms_feat,
    const int* spatial_shape,
    const int* scale_start_index,
    const float* sample_location,
    const float* weights,
    int batch_size,
    int num_cams,
    int num_feat,
    int num_embeds,
    int num_scale,
    int num_anchors,
    int num_pts,
    int num_groups
);
at::Tensor deformable_aggregation_forward(
    const at::Tensor &_mc_ms_feat,
    const at::Tensor &_spatial_shape,
    const at::Tensor &_scale_start_index,
    const at::Tensor &_sampling_location,
    const at::Tensor &_weights
) {
    at::DeviceGuard guard(_mc_ms_feat.device());
    const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(_mc_ms_feat));
    int batch_size = _mc_ms_feat.size(0);
    int num_feat = _mc_ms_feat.size(1);
    int num_embeds = _mc_ms_feat.size(2);
    int num_cams = _spatial_shape.size(0);
    int num_scale = _spatial_shape.size(1);
    int num_anchors = _sampling_location.size(1);
    int num_pts = _sampling_location.size(2);
    int num_groups = _weights.size(5);
    const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
    const int* spatial_shape = _spatial_shape.data_ptr<int>();
    const int* scale_start_index = _scale_start_index.data_ptr<int>();
    const float* sampling_location = _sampling_location.data_ptr<float>();
    const float* weights = _weights.data_ptr<float>();
    auto output = at::zeros({batch_size, num_anchors, num_embeds}, _mc_ms_feat.options());
    int warm_up = 10;
    int prof_cnt = 1;
    hipEvent_t start, stop;
    float milliseconds = 0;
    for (int _i = 0; _i < warm_up; _i++) {
        deformable_aggregation(
            output.data_ptr<float>(),
            mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
            batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
        );
    }
    // The kernel accumulates into `output` with atomic adds, so clear the
    // warm-up contribution before the timed (and returned) run.
    output.zero_();
    hipEventCreate(&start);
    hipEventCreate(&stop);
    hipEventRecord(start);
    for (int _i = 0; _i < prof_cnt; _i++) {
        deformable_aggregation(
            output.data_ptr<float>(),
            mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
            batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
        );
    }
    hipEventRecord(stop);
    hipEventSynchronize(stop);
    hipEventElapsedTime(&milliseconds, start, stop);
    printf("Optimized kernel execution time: %.3f ms\n", milliseconds / prof_cnt);
    hipEventDestroy(start);
    hipEventDestroy(stop);
    return output;
}
at::Tensor deformable_aggregation_forward_ref(
    const at::Tensor &_mc_ms_feat,
    const at::Tensor &_spatial_shape,
    const at::Tensor &_scale_start_index,
    const at::Tensor &_sampling_location,
    const at::Tensor &_weights
) {
    at::DeviceGuard guard(_mc_ms_feat.device());
    const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(_mc_ms_feat));
    int batch_size = _mc_ms_feat.size(0);
    int num_feat = _mc_ms_feat.size(1);
    int num_embeds = _mc_ms_feat.size(2);
    int num_cams = _spatial_shape.size(0);
    int num_scale = _spatial_shape.size(1);
    int num_anchors = _sampling_location.size(1);
    int num_pts = _sampling_location.size(2);
    int num_groups = _weights.size(5);
    const float* mc_ms_feat = _mc_ms_feat.data_ptr<float>();
    const int* spatial_shape = _spatial_shape.data_ptr<int>();
    const int* scale_start_index = _scale_start_index.data_ptr<int>();
    const float* sampling_location = _sampling_location.data_ptr<float>();
    const float* weights = _weights.data_ptr<float>();
    auto output = at::zeros({batch_size, num_anchors, num_embeds}, _mc_ms_feat.options());
    int warm_up = 10;
    int prof_cnt = 1;
    hipEvent_t start, stop;
    float milliseconds = 0;
    for (int _i = 0; _i < warm_up; _i++) {
        deformable_aggregation_ref(
            output.data_ptr<float>(),
            mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
            batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
        );
    }
    // Clear warm-up accumulation before the timed run.
    output.zero_();
    // Create timing events.
    hipEventCreate(&start);
    hipEventCreate(&stop);
    hipEventRecord(start);
    for (int _i = 0; _i < prof_cnt; _i++) {
        deformable_aggregation_ref(
            output.data_ptr<float>(),
            mc_ms_feat, spatial_shape, scale_start_index, sampling_location, weights,
            batch_size, num_cams, num_feat, num_embeds, num_scale, num_anchors, num_pts, num_groups
        );
    }
    hipEventRecord(stop);
    hipEventSynchronize(stop);
    hipEventElapsedTime(&milliseconds, start, stop);
    printf("Reference (pre-optimization) kernel execution time: %.3f ms\n", milliseconds / prof_cnt);
    hipEventDestroy(start);
    hipEventDestroy(stop);
    return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def(
        "deformable_aggregation_forward",
        &deformable_aggregation_forward,
        "deformable_aggregation_forward"
    );
    m.def(
        "deformable_aggregation_forward_ref",
        &deformable_aggregation_forward_ref,
        "deformable_aggregation_forward_ref"
    );
}
import os
import torch
from setuptools import setup
from torch.utils.cpp_extension import (
    BuildExtension,
    CppExtension,
    CUDAExtension,
)


def make_cuda_ext(
    name,
    module,
    sources,
    sources_cuda=[],
    extra_args=[],
    extra_include_path=[],
):
    define_macros = []
    extra_compile_args = {"cxx": [] + extra_args}
    if torch.cuda.is_available() or os.getenv("FORCE_CUDA", "0") == "1":
        define_macros += [("WITH_CUDA", None)]
        extension = CUDAExtension
        extra_compile_args["nvcc"] = extra_args + [
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]
        sources += sources_cuda
    else:
        print("Compiling {} without CUDA".format(name))
        extension = CppExtension
    return extension(
        name="{}.{}".format(module, name),
        sources=[os.path.join(*module.split("."), p) for p in sources],
        include_dirs=extra_include_path,
        define_macros=define_macros,
        extra_compile_args=extra_compile_args,
    )


if __name__ == "__main__":
    setup(
        name="deformable_aggregation_ext",
        ext_modules=[
            make_cuda_ext(
                "deformable_aggregation_ext",
                module="deformable_aggregation_ext",
                sources=[
                    "src/deformable_aggregation.cpp",
                    "src/deformable_aggregation_cuda.cu",
                ],
            ),
        ],
        cmdclass={"build_ext": BuildExtension},
    )