Commit 34e4011b authored by zk

Initial commit

import onnx
import onnxoptimizer as optimizer
def optimize_onnx_model(input_onnx_path, output_onnx_path, fp16=False):
    """
    Optimize an ONNX model: basic graph optimizations plus optional FP16 quantization.
    :param input_onnx_path: path to the original ONNX file
    :param output_onnx_path: path where the optimized model is saved
    :param fp16: enable FP16 quantization (needs GPU support; the speedup is more pronounced)
    """
    # 1. Load the original model
    model = onnx.load(input_onnx_path)
    # 2. Define the optimization passes (add or remove as needed; this is a broadly effective combination)
    optimize_passes = [
        "eliminate_deadend",            # drop nodes whose outputs are unused
        "eliminate_identity",           # remove Identity ops
        "eliminate_unused_initializer", # drop initializers that are never read
        "fuse_bn_into_conv",            # fuse BatchNorm into Conv (the key optimization)
        "fuse_consecutive_concats"      # fuse consecutive Concat ops
    ]
    # 3. Run the basic optimizations
    optimized_model = optimizer.optimize(model, optimize_passes)
    # 4. Optional: FP16 quantization (significant GPU speedup with a controllable accuracy loss)
    # if fp16:
    #     from onnxconverter_common import float16
    #     optimized_model = float16.convert_float_to_float16(
    #         optimized_model,
    #         keep_io_types=True,        # keep inputs/outputs FP32 to avoid dtype-compatibility issues
    #         disable_shape_infer=False
    #     )
    # 5. Validate the optimized model and save it
    onnx.checker.check_model(optimized_model)
    onnx.save(optimized_model, output_onnx_path)
    print(f"Optimized model saved to: {output_onnx_path}")

# Example usage
if __name__ == "__main__":
    # Replace with your own ONNX file paths
    input_onnx = "weights/ground_simplified.onnx"
    output_onnx = "weights/ground_simoptim.onnx"
    # Start with fp16=False (FP32); switch to True (FP16) once you are on a GPU
    optimize_onnx_model(input_onnx, output_onnx, fp16=False)
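
For reference, a minimal sketch of how the optimized model could then be served through ONNX Runtime with the custom op built by the CMake project below; the library path, provider list, and input name/shape are assumptions to adapt:

import numpy as np
import onnxruntime as ort

so = ort.SessionOptions()
# Make custom::ms_deform_attn resolvable (assumed path to the .so built below).
so.register_custom_ops_library("build/libms_deform_attn_ort.so")
sess = ort.InferenceSession("weights/ground_simoptim.onnx", sess_options=so,
                            providers=["ROCMExecutionProvider", "CPUExecutionProvider"])
# Placeholder input name and shape; use the model's real signature.
outputs = sess.run(None, {"input": np.zeros((1, 3, 800, 1200), dtype=np.float32)})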
cmake_minimum_required(VERSION 3.14)
project(GroundingDinoORTPlugin)
# C++ standard
set(CMAKE_CXX_STANDARD 14)
# Tell CMake to use the hipcc compiler
set(CMAKE_CXX_COMPILER "hipcc")
set(CMAKE_C_COMPILER "hipcc")
# 1. Locate the ONNX Runtime headers
# Replace with the directory that actually contains onnxruntime_cxx_api.h in your environment
set(ONNXRUNTIME_INCLUDE_DIR "/opt/dtk-25.04.2/onnxruntime/include")
include_directories(${ONNXRUNTIME_INCLUDE_DIR})
include_directories(.) # add the current directory so the .cuh header can be found
# 2. Build our shared library (.so)
add_library(ms_deform_attn_ort SHARED
ms_deform_attn_ort.hip
)
# =========================================================================
# [Key fix] Tell CMake to compile and link the .hip file as C++
# =========================================================================
set_source_files_properties(ms_deform_attn_ort.hip PROPERTIES LANGUAGE CXX)
set_target_properties(ms_deform_attn_ort PROPERTIES LINKER_LANGUAGE CXX)
# Compiler optimizations
# target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC)
target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC --gpu-max-threads-per-block=1024)
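# A typical out-of-source build of this file (the commands and paths below are
# assumptions, not part of the commit):
#   cmake -S . -B build
#   cmake --build build -j
# yielding build/libms_deform_attn_ort.so, which the Python side registers via
# SessionOptions.register_custom_ops_library.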
// [Key fix 1] Define this macro so ORT does not try to resolve OrtGetApiBase automatically
#define ORT_API_MANUAL_INIT
#include <onnxruntime_c_api.h>
#include <onnxruntime_cxx_api.h>
#include <hip/hip_runtime.h>
#include <vector>
#include <cmath>
#include <algorithm>
// Core compute header
#include "ms_deform_im2col_cuda_hip.cuh"
// ============================================================================
// 1. Kernel definition
// ============================================================================
struct MsDeformAttnKernel {
const OrtApi* api_;
int im2col_step_;
MsDeformAttnKernel(const OrtApi* api, const OrtKernelInfo* info) : api_(api) {
int64_t step = 64;
OrtStatus* status = api_->KernelInfoGetAttribute_int64(info, "im2col_step_i", &step);
if (status != nullptr) {
api_->ReleaseStatus(status);
im2col_step_ = 64; // [Fix 1] fall back to the default
} else {
im2col_step_ = static_cast<int>(step);
}
im2col_step_ = std::max(1, im2col_step_);
}
void CheckStatus(OrtStatus* status) {
if (status != nullptr) {
api_->ReleaseStatus(status);
}
}
std::vector<int64_t> GetTensorDims(const OrtValue* tensor) {
OrtTensorTypeAndShapeInfo* info;
CheckStatus(api_->GetTensorTypeAndShape(tensor, &info));
size_t dim_count;
CheckStatus(api_->GetDimensionsCount(info, &dim_count));
std::vector<int64_t> dims(dim_count);
CheckStatus(api_->GetDimensions(info, dims.data(), dim_count));
api_->ReleaseTensorTypeAndShapeInfo(info);
return dims;
}
void Compute(OrtKernelContext* context) {
OrtStatusPtr status = ComputeV2(context);
if (status != nullptr) {
api_->ReleaseStatus(status);
}
}
OrtStatusPtr ComputeV2(OrtKernelContext* context) {
const OrtValue* value_tensor;
const OrtValue* spatial_shapes_tensor;
const OrtValue* level_start_index_tensor;
const OrtValue* sampling_loc_tensor;
const OrtValue* attn_weight_tensor;
CheckStatus(api_->KernelContext_GetInput(context, 0, &value_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 1, &spatial_shapes_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 2, &level_start_index_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 3, &sampling_loc_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 4, &attn_weight_tensor));
void* value_ptr;
void* spatial_shapes_ptr;
void* level_start_index_ptr;
void* sampling_loc_ptr;
void* attn_weight_ptr;
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(value_tensor), &value_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(spatial_shapes_tensor), &spatial_shapes_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(level_start_index_tensor), &level_start_index_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(sampling_loc_tensor), &sampling_loc_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(attn_weight_tensor), &attn_weight_ptr));
const float* value_data = reinterpret_cast<const float*>(value_ptr);
const int64_t* spatial_shapes_data = reinterpret_cast<const int64_t*>(spatial_shapes_ptr);
const int64_t* level_start_index_data = reinterpret_cast<const int64_t*>(level_start_index_ptr);
const float* sampling_loc_data = reinterpret_cast<const float*>(sampling_loc_ptr);
const float* attn_weight_data = reinterpret_cast<const float*>(attn_weight_ptr);
auto value_dims = GetTensorDims(value_tensor);
auto spatial_shapes_dims = GetTensorDims(spatial_shapes_tensor);
auto sampling_loc_dims = GetTensorDims(sampling_loc_tensor);
const int batch = value_dims[0];
const int spatial_size = value_dims[1];
const int num_heads = value_dims[2];
const int channels = value_dims[3];
const int num_levels = spatial_shapes_dims[0];
const int num_query = sampling_loc_dims[1];
const int num_point = sampling_loc_dims[4];
std::vector<int64_t> output_dims = {batch, num_query, num_heads * channels};
OrtValue* output_tensor;
CheckStatus(api_->KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size(), &output_tensor));
void* output_ptr;
CheckStatus(api_->GetTensorMutableData(output_tensor, &output_ptr));
float* output_data = reinterpret_cast<float*>(output_ptr);
void* stream_ptr;
CheckStatus(api_->KernelContext_GetGPUComputeStream(context, &stream_ptr));
hipStream_t stream = reinterpret_cast<hipStream_t>(stream_ptr);
// const int im2col_step_real = std::min(batch, im2col_step_);
const int im2col_step_real = std::max(1, std::min(batch, im2col_step_));
const int batch_n = im2col_step_real;
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
auto per_output_size = batch_n * num_query * num_heads * channels;
for (int n = 0; n < batch / im2col_step_real; ++n) {
ms_deformable_im2col_cuda(
stream,
value_data + n * im2col_step_real * per_value_size,
spatial_shapes_data,
level_start_index_data,
sampling_loc_data + n * im2col_step_real * per_sample_loc_size,
attn_weight_data + n * im2col_step_real * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
output_data + n * per_output_size
);
}
return nullptr;
}
};
// ============================================================================
// 2. Custom-op interface wrapper
// ============================================================================
struct MsDeformAttnCustomOp : Ort::CustomOpBase<MsDeformAttnCustomOp, MsDeformAttnKernel> {
void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
return new MsDeformAttnKernel(&api, info);
}
OrtStatusPtr CreateKernelV2(const OrtApi& api, const OrtKernelInfo* info, void** op_kernel) const {
*op_kernel = new MsDeformAttnKernel(&api, info);
return nullptr;
}
const char* GetName() const { return "ms_deform_attn"; }
const char* GetExecutionProviderType() const { return "ROCMExecutionProvider"; }
size_t GetInputTypeCount() const { return 5; }
ONNXTensorElementDataType GetInputType(size_t index) const {
if (index == 0 || index == 3 || index == 4) return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
};
// ============================================================================
// 3. Registration entry point
// ============================================================================
MsDeformAttnCustomOp ms_deform_attn_op;
extern "C" OrtStatus* RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) {
// [Key fix 2] Manually initialize the C++ API with the real API pointer handed over from Python
Ort::InitApi(api->GetApi(ORT_API_VERSION));
OrtCustomOpDomain* domain = nullptr;
const OrtApi* ortApi = api->GetApi(ORT_API_VERSION);
if (auto status = ortApi->CreateCustomOpDomain("custom", &domain)) { return status; }
if (auto status = ortApi->CustomOpDomain_Add(domain, &ms_deform_attn_op)) { return status; }
return ortApi->AddCustomOpDomain(options, domain);
}
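
On the graph side, the exported model has to reference the op under the "custom" domain registered above. A hedged sketch of building such a node by hand with onnx.helper (the tensor names are illustrative; a full model would also need an opset import for the "custom" domain):

import onnx
from onnx import helper

# Matches the kernel above: 5 inputs, 1 output, optional im2col_step_i attribute.
node = helper.make_node(
    "ms_deform_attn",
    inputs=["value", "spatial_shapes", "level_start_index",
            "sampling_locations", "attention_weights"],
    outputs=["output"],
    domain="custom",   # must match CreateCustomOpDomain("custom", ...)
    im2col_step_i=64,  # read by KernelInfoGetAttribute_int64 in the kernel constructor
)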
// !!! This is a file automatically generated by hipify!!!
// #include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
/*!
**************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************
* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
* Copyright (c) 2018 Microsoft
**************************************************************************
*/
#include <cstdio>
#include <algorithm>
#include <cstring>
// #include <ATen/ATen.h>
// #include <ATen/hip/HIPContext.h>
// #include <THH/THHAtomics.cuh>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N, const int num_threads)
{
return (N + num_threads - 1) / num_threads;
}
template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const scalar_t lh = h - h_low;
const scalar_t lw = w - w_low;
const scalar_t hh = 1 - lh, hw = 1 - lw;
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
}
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
}
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
}
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
}
const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
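
A NumPy rendering of the same sampling rule, handy for checking the kernel on the CPU. A sketch under the kernel's layout assumption that `data` holds one level's value tensor shaped (H, W, num_heads, channels):

import numpy as np

def im2col_bilinear(data, h, w, m, c):
    # (h, w) are already in the kernel's pixel convention: loc * spatial - 0.5.
    H, W = data.shape[:2]
    h_low, w_low = int(np.floor(h)), int(np.floor(w))
    lh, lw = h - h_low, w - w_low
    hh, hw = 1.0 - lh, 1.0 - lw
    def at(y, x):  # zero padding outside the map, mirroring the kernel's bounds checks
        return float(data[y, x, m, c]) if 0 <= y < H and 0 <= x < W else 0.0
    return (hh * hw * at(h_low, w_low) + hh * lw * at(h_low, w_low + 1) +
            lh * hw * at(h_low + 1, w_low) + lh * lw * at(h_low + 1, w_low + 1))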
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c,
const scalar_t &top_grad,
const scalar_t &attn_weight,
scalar_t* &grad_value,
scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const scalar_t lh = h - h_low;
const scalar_t lw = w - w_low;
const scalar_t hh = 1 - lh, hw = 1 - lw;
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const scalar_t top_grad_value = top_grad * attn_weight;
scalar_t grad_h_weight = 0, grad_w_weight = 0;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
atomicAdd(grad_value+ptr1, w1*top_grad_value);
}
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
atomicAdd(grad_value+ptr2, w2*top_grad_value);
}
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
atomicAdd(grad_value+ptr3, w3*top_grad_value);
}
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
atomicAdd(grad_value+ptr4, w4*top_grad_value);
}
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
*grad_attn_weight = top_grad * val;
*grad_sampling_loc = width * grad_w_weight * top_grad_value;
*(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
}
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c,
const scalar_t &top_grad,
const scalar_t &attn_weight,
scalar_t* &grad_value,
scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const scalar_t lh = h - h_low;
const scalar_t lw = w - w_low;
const scalar_t hh = 1 - lh, hw = 1 - lw;
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const scalar_t top_grad_value = top_grad * attn_weight;
scalar_t grad_h_weight = 0, grad_w_weight = 0;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
atomicAdd(grad_value+ptr1, w1*top_grad_value);
}
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
atomicAdd(grad_value+ptr2, w2*top_grad_value);
}
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
atomicAdd(grad_value+ptr3, w3*top_grad_value);
}
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
atomicAdd(grad_value+ptr4, w4*top_grad_value);
}
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
atomicAdd(grad_attn_weight, top_grad * val);
atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
}
template <typename scalar_t>
__global__ void ms_deformable_im2col_gpu_kernel(const int n,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
scalar_t *data_col_ptr = data_col + index;
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
scalar_t col = 0;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
*data_col_ptr = col;
}
}
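
Putting the pieces together, a NumPy reference of the forward pass this kernel parallelizes over (batch, query, head, channel), reusing im2col_bilinear from the sketch above; it mirrors the kernel's index decomposition and the loc * spatial - 0.5 coordinate convention (a validation sketch, not optimized):

import numpy as np

def ms_deform_attn_forward(value, spatial_shapes, level_start_index,
                           sampling_loc, attn_weight):
    # value: (B, S, M, C); spatial_shapes: (L, 2) rows of (H, W);
    # sampling_loc: (B, Q, M, L, P, 2) normalized as (w, h); attn_weight: (B, Q, M, L, P).
    B, S, M, C = value.shape
    L = spatial_shapes.shape[0]
    Q, P = sampling_loc.shape[1], sampling_loc.shape[4]
    out = np.zeros((B, Q, M * C), dtype=np.float32)
    for b in range(B):
        for l in range(L):
            H, W = int(spatial_shapes[l, 0]), int(spatial_shapes[l, 1])
            start = int(level_start_index[l])
            lvl = value[b, start:start + H * W].reshape(H, W, M, C)
            for q in range(Q):
                for m in range(M):
                    for p in range(P):
                        w_im = sampling_loc[b, q, m, l, p, 0] * W - 0.5
                        h_im = sampling_loc[b, q, m, l, p, 1] * H - 0.5
                        if -1 < h_im < H and -1 < w_im < W:
                            a = attn_weight[b, q, m, l, p]
                            for c in range(C):
                                out[b, q, m * C + c] += a * im2col_bilinear(lvl, h_im, w_im, m, c)
    return out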
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
__shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
__shared__ scalar_t cache_grad_attn_weight[blockSize];
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
if (tid == 0)
{
scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
int sid=2;
for (unsigned int tid = 1; tid < blockSize; ++tid)
{
_grad_w += cache_grad_sampling_loc[sid];
_grad_h += cache_grad_sampling_loc[sid + 1];
_grad_a += cache_grad_attn_weight[tid];
sid += 2;
}
*grad_sampling_loc = _grad_w;
*(grad_sampling_loc + 1) = _grad_h;
*grad_attn_weight = _grad_a;
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
__shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
__shared__ scalar_t cache_grad_attn_weight[blockSize];
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
for (unsigned int s=blockSize/2; s>0; s>>=1)
{
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
}
__syncthreads();
}
if (tid == 0)
{
*grad_sampling_loc = cache_grad_sampling_loc[0];
*(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
*grad_attn_weight = cache_grad_attn_weight[0];
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
extern __shared__ int _s[];
scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
if (tid == 0)
{
scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
int sid=2;
for (unsigned int tid = 1; tid < blockDim.x; ++tid)
{
_grad_w += cache_grad_sampling_loc[sid];
_grad_h += cache_grad_sampling_loc[sid + 1];
_grad_a += cache_grad_attn_weight[tid];
sid += 2;
}
*grad_sampling_loc = _grad_w;
*(grad_sampling_loc + 1) = _grad_h;
*grad_attn_weight = _grad_a;
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
extern __shared__ int _s[];
scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
{
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
if (tid + (s << 1) < spre)
{
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
}
}
__syncthreads();
}
if (tid == 0)
{
*grad_sampling_loc = cache_grad_sampling_loc[0];
*(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
*grad_attn_weight = cache_grad_attn_weight[0];
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
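
The s/spre pair in the reduction loop above is what lets this tree reduction handle active widths that are not powers of two: whenever the odd trailing element at tid + 2s still lies inside the previous active width spre, it is folded in during the same step. A sequential emulation of one block (an illustration only; it ignores the shared-memory and synchronization details):

def block_reduce(vals):
    vals = list(vals)
    s, spre = len(vals) // 2, len(vals)
    while s > 0:
        for tid in range(s):            # threads with tid < s are active
            vals[tid] += vals[tid + s]
            if tid + (s << 1) < spre:   # fold in the odd trailing element
                vals[tid] += vals[tid + (s << 1)]
        s, spre = s >> 1, spre >> 1
    return vals[0]

assert block_reduce(list(range(7))) == sum(range(7))  # works for non-power-of-two sizes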
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
extern __shared__ int _s[];
scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
{
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
if (tid + (s << 1) < spre)
{
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
}
}
__syncthreads();
}
if (tid == 0)
{
atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear_gm(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
grad_sampling_loc, grad_attn_weight);
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t>
void ms_deformable_im2col_cuda(hipStream_t stream,
const scalar_t* data_value,
const int64_t* data_spatial_shapes,
const int64_t* data_level_start_index,
const scalar_t* data_sampling_loc,
const scalar_t* data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t* data_col)
{
const int num_kernels = batch_size * num_query * num_heads * channels;
const int num_actual_kernels = batch_size * num_query * num_heads * channels;
const int num_threads = CUDA_NUM_THREADS;
hipLaunchKernelGGL(( ms_deformable_im2col_gpu_kernel<scalar_t>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
hipError_t err = hipGetLastError();
if (err != hipSuccess)
{
printf("error in ms_deformable_im2col_cuda: %s\n", hipGetErrorString(err));
}
}
template <typename scalar_t>
void ms_deformable_col2im_cuda(hipStream_t stream,
const scalar_t* grad_col,
const scalar_t* data_value,
const int64_t * data_spatial_shapes,
const int64_t * data_level_start_index,
const scalar_t * data_sampling_loc,
const scalar_t * data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t* grad_value,
scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight)
{
const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
const int num_kernels = batch_size * num_query * num_heads * channels;
const int num_actual_kernels = batch_size * num_query * num_heads * channels;
if (channels > 1024)
{
if ((channels & 1023) == 0)
{
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
num_threads*3*sizeof(scalar_t), stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
else
{
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_gm<scalar_t>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
}
else{
switch(channels)
{
case 1:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 2:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 4:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 8:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 16:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 32:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 64:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 128:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 256:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 512:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 1024:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
default:
if (channels < 64)
{
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
num_threads*3*sizeof(scalar_t), stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
else
{
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
num_threads*3*sizeof(scalar_t), stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
}
}
hipError_t err = hipGetLastError();
if (err != hipSuccess)
{
printf("error in ms_deformable_col2im_cuda: %s\n", hipGetErrorString(err));
}
}
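
The launcher's choice of backward kernel reduces to a small decision table on the channel count; a sketch restating the dispatch above:

def col2im_kernel_choice(channels):
    if channels > 1024:
        # multi-block shared-memory reduction only when channels is a multiple of 1024
        return "shm_reduce_v2_multi_blocks" if channels % 1024 == 0 else "gm"
    if channels in (1, 2, 4, 8, 16, 32):
        return "shm_blocksize_aware_reduce_v1"  # thread 0 sums the cache serially
    if channels in (64, 128, 256, 512, 1024):
        return "shm_blocksize_aware_reduce_v2"  # tree reduction, compile-time block size
    return "shm_reduce_v1" if channels < 64 else "shm_reduce_v2"  # runtime block size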
cmake_minimum_required(VERSION 3.14)
project(GroundingDinoORTPlugin)
# C++ standard
set(CMAKE_CXX_STANDARD 14)
# Tell CMake to use the hipcc compiler
set(CMAKE_CXX_COMPILER "hipcc")
set(CMAKE_C_COMPILER "hipcc")
# 1. Locate the ONNX Runtime headers
# Replace with the directory that actually contains onnxruntime_cxx_api.h in your environment
set(ONNXRUNTIME_INCLUDE_DIR "/opt/dtk-25.04.2/onnxruntime/include")
include_directories(${ONNXRUNTIME_INCLUDE_DIR})
include_directories(.) # add the current directory so the .cuh header can be found
# 2. Build our shared library (.so)
add_library(ms_deform_attn_ort SHARED
ms_deform_attn_ort.hip
)
# =========================================================================
# [Key fix] Tell CMake to compile and link the .hip file as C++
# =========================================================================
set_source_files_properties(ms_deform_attn_ort.hip PROPERTIES LANGUAGE CXX)
set_target_properties(ms_deform_attn_ort PROPERTIES LINKER_LANGUAGE CXX)
# Compiler optimizations
# target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC)
target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC --gpu-max-threads-per-block=1024)
// Define this macro so ORT does not try to resolve OrtGetApiBase automatically
#define ORT_API_MANUAL_INIT
#include <onnxruntime_c_api.h>
#include <onnxruntime_cxx_api.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h> // [Change 1] half-precision header
#include <vector>
#include <cmath>
#include <algorithm>
// Core compute header
#include "ms_deform_im2col_cuda_hip.cuh"
// ============================================================================
// 1. Kernel definition
// ============================================================================
struct MsDeformAttnKernel {
const OrtApi* api_;
int im2col_step_;
MsDeformAttnKernel(const OrtApi* api, const OrtKernelInfo* info) : api_(api) {
int64_t step = 64;
OrtStatus* status = api_->KernelInfoGetAttribute_int64(info, "im2col_step_i", &step);
if (status != nullptr) {
api_->ReleaseStatus(status);
im2col_step_ = 64; // fall back to the default
} else {
im2col_step_ = static_cast<int>(step);
}
im2col_step_ = std::max(1, im2col_step_);
}
void CheckStatus(OrtStatus* status) {
if (status != nullptr) {
api_->ReleaseStatus(status);
}
}
std::vector<int64_t> GetTensorDims(const OrtValue* tensor) {
OrtTensorTypeAndShapeInfo* info;
CheckStatus(api_->GetTensorTypeAndShape(tensor, &info));
size_t dim_count;
CheckStatus(api_->GetDimensionsCount(info, &dim_count));
std::vector<int64_t> dims(dim_count);
CheckStatus(api_->GetDimensions(info, dims.data(), dim_count));
api_->ReleaseTensorTypeAndShapeInfo(info);
return dims;
}
void Compute(OrtKernelContext* context) {
OrtStatusPtr status = ComputeV2(context);
if (status != nullptr) {
api_->ReleaseStatus(status);
}
}
OrtStatusPtr ComputeV2(OrtKernelContext* context) {
const OrtValue* value_tensor;
const OrtValue* spatial_shapes_tensor;
const OrtValue* level_start_index_tensor;
const OrtValue* sampling_loc_tensor;
const OrtValue* attn_weight_tensor;
CheckStatus(api_->KernelContext_GetInput(context, 0, &value_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 1, &spatial_shapes_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 2, &level_start_index_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 3, &sampling_loc_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 4, &attn_weight_tensor));
void* value_ptr;
void* spatial_shapes_ptr;
void* level_start_index_ptr;
void* sampling_loc_ptr;
void* attn_weight_ptr;
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(value_tensor), &value_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(spatial_shapes_tensor), &spatial_shapes_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(level_start_index_tensor), &level_start_index_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(sampling_loc_tensor), &sampling_loc_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(attn_weight_tensor), &attn_weight_ptr));
// [Change 2] The reinterpret_cast targets switch from float to __half
const __half* value_data = reinterpret_cast<const __half*>(value_ptr);
const int64_t* spatial_shapes_data = reinterpret_cast<const int64_t*>(spatial_shapes_ptr); // shapes stay int64
const int64_t* level_start_index_data = reinterpret_cast<const int64_t*>(level_start_index_ptr); // indices stay int64
const __half* sampling_loc_data = reinterpret_cast<const __half*>(sampling_loc_ptr);
const __half* attn_weight_data = reinterpret_cast<const __half*>(attn_weight_ptr);
auto value_dims = GetTensorDims(value_tensor);
auto spatial_shapes_dims = GetTensorDims(spatial_shapes_tensor);
auto sampling_loc_dims = GetTensorDims(sampling_loc_tensor);
const int batch = value_dims[0];
const int spatial_size = value_dims[1];
const int num_heads = value_dims[2];
const int channels = value_dims[3];
const int num_levels = spatial_shapes_dims[0];
const int num_query = sampling_loc_dims[1];
const int num_point = sampling_loc_dims[4];
std::vector<int64_t> output_dims = {batch, num_query, num_heads * channels};
OrtValue* output_tensor;
CheckStatus(api_->KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size(), &output_tensor));
void* output_ptr;
CheckStatus(api_->GetTensorMutableData(output_tensor, &output_ptr));
// [Change 3] The output pointer is reinterpreted as __half
__half* output_data = reinterpret_cast<__half*>(output_ptr);
void* stream_ptr;
CheckStatus(api_->KernelContext_GetGPUComputeStream(context, &stream_ptr));
hipStream_t stream = reinterpret_cast<hipStream_t>(stream_ptr);
const int im2col_step_real = std::max(1, std::min(batch, im2col_step_));
const int batch_n = im2col_step_real;
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
auto per_output_size = batch_n * num_query * num_heads * channels;
for (int n = 0; n < batch / im2col_step_real; ++n) {
// [Change 4] Instantiate the template explicitly with <__half>
ms_deformable_im2col_cuda<__half>(
stream,
value_data + n * im2col_step_real * per_value_size,
spatial_shapes_data,
level_start_index_data,
sampling_loc_data + n * im2col_step_real * per_sample_loc_size,
attn_weight_data + n * im2col_step_real * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
output_data + n * per_output_size
);
}
return nullptr;
}
};
// ============================================================================
// 2. Custom-op interface wrapper
// ============================================================================
struct MsDeformAttnCustomOp : Ort::CustomOpBase<MsDeformAttnCustomOp, MsDeformAttnKernel> {
void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
return new MsDeformAttnKernel(&api, info);
}
OrtStatusPtr CreateKernelV2(const OrtApi& api, const OrtKernelInfo* info, void** op_kernel) const {
*op_kernel = new MsDeformAttnKernel(&api, info);
return nullptr;
}
const char* GetName() const { return "ms_deform_attn"; }
const char* GetExecutionProviderType() const { return "ROCMExecutionProvider"; }
size_t GetInputTypeCount() const { return 5; }
ONNXTensorElementDataType GetInputType(size_t index) const {
// [Change 5] Declare inputs 0, 3 and 4 as FLOAT16
if (index == 0 || index == 3 || index == 4) return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
// [Change 6] Declare the output as FLOAT16
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
}
};
// ============================================================================
// 3. Registration entry point
// ============================================================================
MsDeformAttnCustomOp ms_deform_attn_op;
extern "C" OrtStatus* RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) {
// Manually initialize the C++ API with the real API pointer handed over from Python
Ort::InitApi(api->GetApi(ORT_API_VERSION));
OrtCustomOpDomain* domain = nullptr;
const OrtApi* ortApi = api->GetApi(ORT_API_VERSION);
if (auto status = ortApi->CreateCustomOpDomain("custom", &domain)) { return status; }
if (auto status = ortApi->CustomOpDomain_Add(domain, &ms_deform_attn_op)) { return status; }
return ortApi->AddCustomOpDomain(options, domain);
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include <hip/hip_fp16.h> // [Change 1] half-precision support
/*!
**************************************************************************
* Deformable DETR (Inference Only - Forward Pass)
**************************************************************************
*/
#include <cstdio>
#include <algorithm>
#include <cstring>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N, const int num_threads)
{
return (N + num_threads - 1) / num_threads;
}
// =================================================================================
// Core math: bilinear interpolation (device side)
// [Change 2] The return type is forced to float to keep accumulation precise; the h/w coordinate parameters are float too
// =================================================================================
template <typename scalar_t>
__device__ float ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const float &h, const float &w, const int &m, const int &c)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
// The interpolation coefficients are computed entirely in FP32
const float lh = h - h_low;
const float lw = w - w_low;
const float hh = 1.0f - lh, hw = 1.0f - lw;
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
    // [Change 3] Convert each FP16 (scalar_t) value to float immediately after reading it from device memory
float v1 = 0.0f;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = static_cast<float>(bottom_data[ptr1]);
}
float v2 = 0.0f;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = static_cast<float>(bottom_data[ptr2]);
}
float v3 = 0.0f;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = static_cast<float>(bottom_data[ptr3]);
}
float v4 = 0.0f;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = static_cast<float>(bottom_data[ptr4]);
}
const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    // Weighted sum in FP32 to avoid overflow and underflow
const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
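// Worked example: h = 2.3, w = 4.7 gives h_low = 2, w_low = 4, lh = 0.3, lw = 0.7,
// hh = 0.7, hw = 0.3, so w1 = 0.21, w2 = 0.49, w3 = 0.09, w4 = 0.21; the four
// corner weights always sum to 1.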
// =================================================================================
// Main kernel: loop over multi-scale feature levels and sampling points (global side)
// =================================================================================
template <typename scalar_t>
__global__ void ms_deformable_im2col_gpu_kernel(const int n,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
scalar_t *data_col_ptr = data_col + index;
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
        // [Change 4] The accumulator must be float: repeated FP16 additions can overflow or lose precision
float col = 0.0f;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
for (int p_col=0; p_col < num_point; ++p_col)
{
                // [Change 5] Critical: convert the sampling coordinates and attention weight to float as soon as they are read
const float loc_w = static_cast<float>(data_sampling_loc[data_loc_w_ptr]);
const float loc_h = static_cast<float>(data_sampling_loc[data_loc_w_ptr + 1]);
const float weight = static_cast<float>(data_attn_weight[data_weight_ptr]);
                // Compute the actual coordinates in FP32 to preserve precision
const float h_im = loc_h * spatial_h - 0.5f;
const float w_im = loc_w * spatial_w - 0.5f;
if (h_im > -1.0f && w_im > -1.0f && h_im < spatial_h && w_im < spatial_w)
{
                    // ms_deform_attn_im2col_bilinear has been rewritten to return float
col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
        // [Change 6] After accumulation, cast the float result back to FP16 (scalar_t) and write it to device memory
*data_col_ptr = static_cast<scalar_t>(col);
}
}
// =================================================================================
// C++ host-side entry point (launcher)
// =================================================================================
template <typename scalar_t>
void ms_deformable_im2col_cuda(hipStream_t stream,
const scalar_t* data_value,
const int64_t* data_spatial_shapes,
const int64_t* data_level_start_index,
const scalar_t* data_sampling_loc,
const scalar_t* data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t* data_col)
{
const int num_kernels = batch_size * num_query * num_heads * channels;
const int num_actual_kernels = batch_size * num_query * num_heads * channels;
const int num_threads = CUDA_NUM_THREADS;
hipLaunchKernelGGL(( ms_deformable_im2col_gpu_kernel<scalar_t>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
hipError_t err = hipGetLastError();
if (err != hipSuccess)
{
printf("error in ms_deformable_im2col_cuda: %s\n", hipGetErrorString(err));
}
}
\ No newline at end of file
cmake_minimum_required(VERSION 3.14)
project(GroundingDinoORTPlugin)
# Set the C++ standard
set(CMAKE_CXX_STANDARD 14)
# Tell CMake to use the hipcc compiler
set(CMAKE_CXX_COMPILER "hipcc")
set(CMAKE_C_COMPILER "hipcc")
# 1. Locate the ONNX Runtime headers
# Replace this with the directory that actually contains onnxruntime_cxx_api.h in your environment
set(ONNXRUNTIME_INCLUDE_DIR "/opt/dtk-25.04.2/onnxruntime/include")
include_directories(${ONNXRUNTIME_INCLUDE_DIR})
include_directories(.) # Add the current directory so the .cuh header is found
# 2. Build our shared library (.so)
add_library(ms_deform_attn_ort SHARED
    ms_deform_attn_ort.hip
)
# =========================================================================
# [Key fix] Tell CMake to compile and link the .hip file as C++
# =========================================================================
set_source_files_properties(ms_deform_attn_ort.hip PROPERTIES LANGUAGE CXX)
set_target_properties(ms_deform_attn_ort PROPERTIES LINKER_LANGUAGE CXX)
# Compilation flags
# target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC)
target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC --gpu-max-threads-per-block=1024)
\ No newline at end of file
// Key fix 1: define this macro to disable ORT's automatic lookup of OrtGetApiBase
#define ORT_API_MANUAL_INIT
#include <onnxruntime_c_api.h>
#include <onnxruntime_cxx_api.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <vector>
#include <cmath>
#include <algorithm>
// Core computation header
#include "ms_deform_im2col_cuda_hip.cuh"
// ============================================================================
// 1. Kernel definition
// ============================================================================
struct MsDeformAttnKernel {
const OrtApi* api_;
int im2col_step_;
    // ========================================================================
    // [Object-level GPU memory pool] Kept as member variables so the buffers
    // live for the whole lifetime of the kernel object
    // ========================================================================
float* v_f32_ = nullptr;
size_t v_cap_ = 0;
float* sl_f32_ = nullptr;
size_t sl_cap_ = 0;
float* aw_f32_ = nullptr;
size_t aw_cap_ = 0;
float* out_f32_ = nullptr;
size_t out_cap_ = 0;
MsDeformAttnKernel(const OrtApi* api, const OrtKernelInfo* info) : api_(api) {
int64_t step = 64;
OrtStatus* status = api_->KernelInfoGetAttribute_int64(info, "im2col_step_i", &step);
if (status != nullptr) {
api_->ReleaseStatus(status);
im2col_step_ = 64;
} else {
im2col_step_ = static_cast<int>(step);
}
im2col_step_ = std::max(1, im2col_step_);
}
    // Destructor: free the device buffers when the model is unloaded, so nothing leaks
~MsDeformAttnKernel() {
if (v_f32_) hipFree(v_f32_);
if (sl_f32_) hipFree(sl_f32_);
if (aw_f32_) hipFree(aw_f32_);
if (out_f32_) hipFree(out_f32_);
}
void CheckStatus(OrtStatus* status) {
if (status != nullptr) {
api_->ReleaseStatus(status);
}
}
std::vector<int64_t> GetTensorDims(const OrtValue* tensor) {
OrtTensorTypeAndShapeInfo* info;
CheckStatus(api_->GetTensorTypeAndShape(tensor, &info));
size_t dim_count;
CheckStatus(api_->GetDimensionsCount(info, &dim_count));
std::vector<int64_t> dims(dim_count);
CheckStatus(api_->GetDimensions(info, dims.data(), dim_count));
api_->ReleaseTensorTypeAndShapeInfo(info);
return dims;
}
void Compute(OrtKernelContext* context) {
OrtStatusPtr status = ComputeV2(context);
if (status != nullptr) {
api_->ReleaseStatus(status);
}
}
OrtStatusPtr ComputeV2(OrtKernelContext* context) {
const OrtValue* value_tensor;
const OrtValue* spatial_shapes_tensor;
const OrtValue* level_start_index_tensor;
const OrtValue* sampling_loc_tensor;
const OrtValue* attn_weight_tensor;
CheckStatus(api_->KernelContext_GetInput(context, 0, &value_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 1, &spatial_shapes_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 2, &level_start_index_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 3, &sampling_loc_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 4, &attn_weight_tensor));
void *value_ptr, *spatial_shapes_ptr, *level_start_index_ptr, *sampling_loc_ptr, *attn_weight_ptr;
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(value_tensor), &value_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(spatial_shapes_tensor), &spatial_shapes_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(level_start_index_tensor), &level_start_index_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(sampling_loc_tensor), &sampling_loc_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(attn_weight_tensor), &attn_weight_ptr));
auto value_dims = GetTensorDims(value_tensor);
auto spatial_shapes_dims = GetTensorDims(spatial_shapes_tensor);
auto sampling_loc_dims = GetTensorDims(sampling_loc_tensor);
const int batch = value_dims[0];
const int spatial_size = value_dims[1];
const int num_heads = value_dims[2];
const int channels = value_dims[3];
const int num_levels = spatial_shapes_dims[0];
const int num_query = sampling_loc_dims[1];
const int num_point = sampling_loc_dims[4];
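        // Expected input layouts (deformable-attention convention, inferred from
        // the indexing below):
        //   value:          [batch, spatial_size, num_heads, channels]
        //   spatial_shapes: [num_levels, 2]
        //   sampling_loc:   [batch, num_query, num_heads, num_levels, num_point, 2]
        //   attn_weight:    [batch, num_query, num_heads, num_levels, num_point]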
std::vector<int64_t> output_dims = {batch, num_query, num_heads * channels};
OrtValue* output_tensor;
CheckStatus(api_->KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size(), &output_tensor));
void* output_ptr;
CheckStatus(api_->GetTensorMutableData(output_tensor, &output_ptr));
void* stream_ptr;
CheckStatus(api_->KernelContext_GetGPUComputeStream(context, &stream_ptr));
hipStream_t stream = reinterpret_cast<hipStream_t>(stream_ptr);
        // ============================================================================
        // [Buffer scheduling] Allocate device memory only on the first frame, and grow
        // the buffers automatically when a larger input resolution arrives
        // ============================================================================
size_t v_bytes = batch * spatial_size * num_heads * channels * sizeof(float);
size_t sl_bytes = batch * num_query * num_heads * num_levels * num_point * 2 * sizeof(float);
size_t aw_bytes = batch * num_query * num_heads * num_levels * num_point * sizeof(float);
size_t out_bytes = batch * num_query * num_heads * channels * sizeof(float);
bool need_sync = (v_bytes > v_cap_) || (sl_bytes > sl_cap_) || (aw_bytes > aw_cap_) || (out_bytes > out_cap_);
if (need_sync) {
            // A larger resolution arrived: synchronize, then safely free and reallocate the old buffers
hipStreamSynchronize(stream);
if (v_bytes > v_cap_) { if (v_f32_) hipFree(v_f32_); hipMalloc((void**)&v_f32_, v_bytes); v_cap_ = v_bytes; }
if (sl_bytes > sl_cap_) { if (sl_f32_) hipFree(sl_f32_); hipMalloc((void**)&sl_f32_, sl_bytes); sl_cap_ = sl_bytes; }
if (aw_bytes > aw_cap_) { if (aw_f32_) hipFree(aw_f32_); hipMalloc((void**)&aw_f32_, aw_bytes); aw_cap_ = aw_bytes; }
if (out_bytes > out_cap_) { if (out_f32_) hipFree(out_f32_); hipMalloc((void**)&out_f32_, out_bytes); out_cap_ = out_bytes; }
}
        // ============================================================================
        // Data flow: FP16 -> FP32 (compute) -> FP16
        // ============================================================================
int v_cnt = v_bytes / sizeof(float);
int sl_cnt = sl_bytes / sizeof(float);
int aw_cnt = aw_bytes / sizeof(float);
int out_cnt = out_bytes / sizeof(float);
RunFastCast<__half, float>(stream, reinterpret_cast<const __half*>(value_ptr), v_f32_, v_cnt);
RunFastCast<__half, float>(stream, reinterpret_cast<const __half*>(sampling_loc_ptr), sl_f32_, sl_cnt);
RunFastCast<__half, float>(stream, reinterpret_cast<const __half*>(attn_weight_ptr), aw_f32_, aw_cnt);
const int64_t* spatial_shapes_data = reinterpret_cast<const int64_t*>(spatial_shapes_ptr);
const int64_t* level_start_index_data = reinterpret_cast<const int64_t*>(level_start_index_ptr);
const int im2col_step_real = std::max(1, std::min(batch, im2col_step_));
const int batch_n = im2col_step_real;
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
auto per_output_size = batch_n * num_query * num_heads * channels;
for (int n = 0; n < batch / im2col_step_real; ++n) {
ms_deformable_im2col_cuda_fp32(
stream,
v_f32_ + n * im2col_step_real * per_value_size,
spatial_shapes_data,
level_start_index_data,
sl_f32_ + n * im2col_step_real * per_sample_loc_size,
aw_f32_ + n * im2col_step_real * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
out_f32_ + n * per_output_size
);
}
RunFastCast<float, __half>(stream, out_f32_, reinterpret_cast<__half*>(output_ptr), out_cnt);
        // Deliberately no hipFree here: keep the buffers alive and reuse them on the next frame
return nullptr;
}
};
// ============================================================================
// 2. Custom op interface wrapper
// ============================================================================
struct MsDeformAttnCustomOp : Ort::CustomOpBase<MsDeformAttnCustomOp, MsDeformAttnKernel> {
void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
return new MsDeformAttnKernel(&api, info);
}
OrtStatusPtr CreateKernelV2(const OrtApi& api, const OrtKernelInfo* info, void** op_kernel) const {
*op_kernel = new MsDeformAttnKernel(&api, info);
return nullptr;
}
const char* GetName() const { return "ms_deform_attn"; }
const char* GetExecutionProviderType() const { return "ROCMExecutionProvider"; }
size_t GetInputTypeCount() const { return 5; }
ONNXTensorElementDataType GetInputType(size_t index) const {
if (index == 0 || index == 3 || index == 4) return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
}
};
// ============================================================================
// 3. Registration entry point
// ============================================================================
MsDeformAttnCustomOp ms_deform_attn_op;
extern "C" OrtStatus* RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) {
Ort::InitApi(api->GetApi(ORT_API_VERSION));
OrtCustomOpDomain* domain = nullptr;
const OrtApi* ortApi = api->GetApi(ORT_API_VERSION);
if (auto status = ortApi->CreateCustomOpDomain("custom", &domain)) { return status; }
if (auto status = ortApi->CustomOpDomain_Add(domain, &ms_deform_attn_op)) { return status; }
return ortApi->AddCustomOpDomain(options, domain);
}
\ No newline at end of file
#include "hip/hip_runtime.h"
#include <hip/hip_fp16.h>
#include <cstdio>
#include <algorithm>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N, const int num_threads) {
return (N + num_threads - 1) / num_threads;
}
// ============================================================================
// [Approach B] Fast elementwise cast kernels (FP16 <-> FP32)
// ============================================================================
template <typename T_IN, typename T_OUT>
__global__ void FastCastKernel(const T_IN* input, T_OUT* output, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
output[i] = static_cast<T_OUT>(input[i]);
}
}
template <typename T_IN, typename T_OUT>
void RunFastCast(hipStream_t stream, const T_IN* input, T_OUT* output, int n) {
if (n <= 0) return;
const int threads = 512;
const int blocks = (n + threads - 1) / threads;
    // [Key fix 1] Use the <<< >>> launch syntax to avoid hipLaunchKernelGGL's macro-parsing problem with comma-separated template arguments
FastCastKernel<T_IN, T_OUT><<<blocks, threads, 0, stream>>>(input, output, n);
}
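// Usage sketch: RunFastCast<__half, float>(stream, d_in_fp16, d_out_fp32, count)
// widens a device buffer to FP32; RunFastCast<float, __half>(...) narrows it back.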
// ============================================================================
// FP32 bilinear interpolation (device)
// ============================================================================
__device__ float ms_deform_attn_im2col_bilinear_fp32(const float* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const float &h, const float &w, const int &m, const int &c)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const float lh = h - h_low;
const float lw = w - w_low;
const float hh = 1.0f - lh, hw = 1.0f - lw;
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) {
v1 = bottom_data[h_low_ptr_offset + w_low_ptr_offset + base_ptr];
}
float v2 = 0;
if (h_low >= 0 && w_high <= width - 1) {
v2 = bottom_data[h_low_ptr_offset + w_high_ptr_offset + base_ptr];
}
float v3 = 0;
if (h_high <= height - 1 && w_low >= 0) {
v3 = bottom_data[h_high_ptr_offset + w_low_ptr_offset + base_ptr];
}
float v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) {
v4 = bottom_data[h_high_ptr_offset + w_high_ptr_offset + base_ptr];
}
return (hh * hw * v1 + hh * lw * v2 + lh * hw * v3 + lh * lw * v4);
}
// ============================================================================
// FP32 core compute kernel (global)
// ============================================================================
__global__ void ms_deformable_im2col_gpu_kernel_fp32(const int n,
const float *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const float *data_sampling_loc,
const float *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
float *data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
int _temp = index;
const int c_col = _temp % channels; _temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads; _temp /= num_heads;
const int q_col = _temp % num_query; _temp /= num_query;
const int b_col = _temp;
float *data_col_ptr = data_col + index;
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
float col = 0.0f;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const float *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
for (int p_col=0; p_col < num_point; ++p_col)
{
const float loc_w = data_sampling_loc[data_loc_w_ptr];
const float loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const float weight = data_attn_weight[data_weight_ptr];
const float h_im = loc_h * spatial_h - 0.5f;
const float w_im = loc_w * spatial_w - 0.5f;
if (h_im > -1.0f && w_im > -1.0f && h_im < spatial_h && w_im < spatial_w)
{
col += ms_deform_attn_im2col_bilinear_fp32(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
*data_col_ptr = col;
}
}
// ============================================================================
// Kernel launcher
// ============================================================================
void ms_deformable_im2col_cuda_fp32(hipStream_t stream,
const float* data_value,
const int64_t* data_spatial_shapes,
const int64_t* data_level_start_index,
const float* data_sampling_loc,
const float* data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
float* data_col)
{
const int num_kernels = batch_size * num_query * num_heads * channels;
const int num_threads = CUDA_NUM_THREADS;
    // [Key fix 2] Likewise use the <<< >>> syntax to sidestep the macro-expansion pitfall
ms_deformable_im2col_gpu_kernel_fp32<<<GET_BLOCKS(num_kernels, num_threads), num_threads, 0, stream>>>(
num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
}
\ No newline at end of file
cmake_minimum_required(VERSION 3.14)
project(GroundingDinoORTPlugin)
# Set the C++ standard
set(CMAKE_CXX_STANDARD 14)
# Tell CMake to use the hipcc compiler
set(CMAKE_CXX_COMPILER "hipcc")
set(CMAKE_C_COMPILER "hipcc")
# 1. Locate the ONNX Runtime headers
# Replace this with the directory that actually contains onnxruntime_cxx_api.h in your environment
set(ONNXRUNTIME_INCLUDE_DIR "/opt/dtk-25.04.2/onnxruntime/include")
include_directories(${ONNXRUNTIME_INCLUDE_DIR})
include_directories(.) # Add the current directory so the .cuh header is found
# 2. Build our shared library (.so)
add_library(ms_deform_attn_ort SHARED
    ms_deform_attn_ort.hip
)
# =========================================================================
# [Key fix] Tell CMake to compile and link the .hip file as C++
# =========================================================================
set_source_files_properties(ms_deform_attn_ort.hip PROPERTIES LANGUAGE CXX)
set_target_properties(ms_deform_attn_ort PROPERTIES LINKER_LANGUAGE CXX)
# Compilation flags
# target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC)
target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC --gpu-max-threads-per-block=1024)
\ No newline at end of file
#define ORT_API_MANUAL_INIT
#include <onnxruntime_c_api.h>
#include <onnxruntime_cxx_api.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <vector>
#include <cmath>
#include <algorithm>
// Core computation header
#include "ms_deform_im2col_cuda_hip.cuh"
// ============================================================================
// 1. Kernel definition (clean native-FP16 interface)
// ============================================================================
struct MsDeformAttnKernel {
const OrtApi* api_;
int im2col_step_;
MsDeformAttnKernel(const OrtApi* api, const OrtKernelInfo* info) : api_(api) {
int64_t step = 64;
OrtStatus* status = api_->KernelInfoGetAttribute_int64(info, "im2col_step_i", &step);
if (status != nullptr) {
api_->ReleaseStatus(status);
im2col_step_ = 64;
} else {
im2col_step_ = static_cast<int>(step);
}
im2col_step_ = std::max(1, im2col_step_);
}
void CheckStatus(OrtStatus* status) {
if (status != nullptr) api_->ReleaseStatus(status);
}
std::vector<int64_t> GetTensorDims(const OrtValue* tensor) {
OrtTensorTypeAndShapeInfo* info;
CheckStatus(api_->GetTensorTypeAndShape(tensor, &info));
size_t dim_count;
CheckStatus(api_->GetDimensionsCount(info, &dim_count));
std::vector<int64_t> dims(dim_count);
CheckStatus(api_->GetDimensions(info, dims.data(), dim_count));
api_->ReleaseTensorTypeAndShapeInfo(info);
return dims;
}
void Compute(OrtKernelContext* context) {
OrtStatusPtr status = ComputeV2(context);
if (status != nullptr) api_->ReleaseStatus(status);
}
OrtStatusPtr ComputeV2(OrtKernelContext* context) {
const OrtValue *v_t, *ss_t, *lsi_t, *sl_t, *aw_t;
CheckStatus(api_->KernelContext_GetInput(context, 0, &v_t));
CheckStatus(api_->KernelContext_GetInput(context, 1, &ss_t));
CheckStatus(api_->KernelContext_GetInput(context, 2, &lsi_t));
CheckStatus(api_->KernelContext_GetInput(context, 3, &sl_t));
CheckStatus(api_->KernelContext_GetInput(context, 4, &aw_t));
void *v_p, *ss_p, *lsi_p, *sl_p, *aw_p;
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(v_t), &v_p));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(ss_t), &ss_p));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(lsi_t), &lsi_p));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(sl_t), &sl_p));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(aw_t), &aw_p));
auto v_dims = GetTensorDims(v_t);
auto sl_dims = GetTensorDims(sl_t);
auto ss_dims = GetTensorDims(ss_t);
const int batch = v_dims[0];
const int spatial_size = v_dims[1];
const int num_heads = v_dims[2];
const int channels = v_dims[3];
const int num_levels = ss_dims[0];
const int num_query = sl_dims[1];
const int num_point = sl_dims[4];
std::vector<int64_t> output_dims = {batch, num_query, num_heads * channels};
OrtValue* output_tensor;
CheckStatus(api_->KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size(), &output_tensor));
void* output_ptr;
CheckStatus(api_->GetTensorMutableData(output_tensor, &output_ptr));
void* stream_ptr;
CheckStatus(api_->KernelContext_GetGPUComputeStream(context, &stream_ptr));
hipStream_t stream = reinterpret_cast<hipStream_t>(stream_ptr);
        // Reinterpret the raw pointers as native __half
const __half* v_f16 = reinterpret_cast<const __half*>(v_p);
const __half* sl_f16 = reinterpret_cast<const __half*>(sl_p);
const __half* aw_f16 = reinterpret_cast<const __half*>(aw_p);
__half* out_f16 = reinterpret_cast<__half*>(output_ptr);
const int64_t* ss_i64 = reinterpret_cast<const int64_t*>(ss_p);
const int64_t* lsi_i64 = reinterpret_cast<const int64_t*>(lsi_p);
const int im2col_step_real = std::max(1, std::min(batch, im2col_step_));
const int batch_n = im2col_step_real;
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
auto per_output_size = batch_n * num_query * num_heads * channels;
        // Key change: no intermediate casts at all; hand the FP16 tensors straight to the half2 kernel
for (int n = 0; n < batch / im2col_step_real; ++n) {
ms_deformable_im2col_cuda_half2(
stream,
v_f16 + n * im2col_step_real * per_value_size,
ss_i64,
lsi_i64,
sl_f16 + n * im2col_step_real * per_sample_loc_size,
aw_f16 + n * im2col_step_real * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
out_f16 + n * per_output_size
);
}
return nullptr;
}
};
// ============================================================================
// 2. Custom op interface wrapper
// ============================================================================
struct MsDeformAttnCustomOp : Ort::CustomOpBase<MsDeformAttnCustomOp, MsDeformAttnKernel> {
void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
return new MsDeformAttnKernel(&api, info);
}
OrtStatusPtr CreateKernelV2(const OrtApi& api, const OrtKernelInfo* info, void** op_kernel) const {
*op_kernel = new MsDeformAttnKernel(&api, info);
return nullptr;
}
const char* GetName() const { return "ms_deform_attn"; }
const char* GetExecutionProviderType() const { return "ROCMExecutionProvider"; }
size_t GetInputTypeCount() const { return 5; }
ONNXTensorElementDataType GetInputType(size_t index) const {
if (index == 0 || index == 3 || index == 4) return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
}
};
// ============================================================================
// 3. Registration entry point
// ============================================================================
MsDeformAttnCustomOp ms_deform_attn_op;
extern "C" OrtStatus* RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) {
Ort::InitApi(api->GetApi(ORT_API_VERSION));
OrtCustomOpDomain* domain = nullptr;
const OrtApi* ortApi = api->GetApi(ORT_API_VERSION);
if (auto status = ortApi->CreateCustomOpDomain("custom", &domain)) { return status; }
if (auto status = ortApi->CustomOpDomain_Add(domain, &ms_deform_attn_op)) { return status; }
return ortApi->AddCustomOpDomain(options, domain);
}
\ No newline at end of file
#include "hip/hip_runtime.h"
#include <hip/hip_fp16.h>
#include <cstdio>
#include <algorithm>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N, const int num_threads) {
return (N + num_threads - 1) / num_threads;
}
// ============================================================================
// [Approach A] Vectorized bilinear interpolation (2 channels per lookup)
// ============================================================================
__device__ float2 ms_deform_attn_im2col_bilinear_half2(const half2* bottom_data,
const int height, const int width, const int nheads, const int channels_half2,
const float h, const float w, const int m, const int c_half2)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const float lh = h - h_low;
const float lw = w - w_low;
const float hh = 1.0f - lh, hw = 1.0f - lw;
    // Note: all strides here are measured in half2 units
const int w_stride = nheads * channels_half2;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels_half2 + c_half2;
    // A half2 holding two zeros
half2 zero_h2 = __float2half2_rn(0.0f);
    // Each corner is fetched with a single 32-bit (half2) load
half2 v1_h2 = (h_low >= 0 && w_low >= 0) ? bottom_data[h_low_ptr_offset + w_low_ptr_offset + base_ptr] : zero_h2;
half2 v2_h2 = (h_low >= 0 && w_high <= width - 1) ? bottom_data[h_low_ptr_offset + w_high_ptr_offset + base_ptr] : zero_h2;
half2 v3_h2 = (h_high <= height - 1 && w_low >= 0) ? bottom_data[h_high_ptr_offset + w_low_ptr_offset + base_ptr] : zero_h2;
half2 v4_h2 = (h_high <= height - 1 && w_high <= width - 1) ? bottom_data[h_high_ptr_offset + w_high_ptr_offset + base_ptr] : zero_h2;
    // Convert losslessly to float2 for high-precision accumulation
float2 v1 = __half22float2(v1_h2);
float2 v2 = __half22float2(v2_h2);
float2 v3 = __half22float2(v3_h2);
float2 v4 = __half22float2(v4_h2);
float2 res;
res.x = hh * hw * v1.x + hh * lw * v2.x + lh * hw * v3.x + lh * lw * v4.x;
res.y = hh * hw * v1.y + hh * lw * v2.y + lh * hw * v3.y + lh * lw * v4.y;
return res;
}
// ============================================================================
// [Approach A] Native FP16 + half2 vectorized compute kernel
// ============================================================================
__global__ void ms_deformable_im2col_gpu_kernel_half2(const int n,
const half2 *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const __half *data_sampling_loc,
const __half *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
half2 *data_col)
{
    // n is batch_size * num_query * num_heads * (channels / 2)
CUDA_KERNEL_LOOP(index, n)
{
int _temp = index;
        const int channels_half2 = channels / 2; // channel count halved: one half2 covers two channels
const int c_col_half2 = _temp % channels_half2; _temp /= channels_half2;
const int sampling_index = _temp;
const int m_col = _temp % num_heads; _temp /= num_heads;
const int q_col = _temp % num_query; _temp /= num_query;
const int b_col = _temp;
half2 *data_col_ptr = data_col + index;
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int qid_stride_half2 = num_heads * channels_half2;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride_half2;
        // Accumulate in float2 so repeated FP16 additions cannot overflow
float2 col;
col.x = 0.0f; col.y = 0.0f;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const half2 *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride_half2);
for (int p_col=0; p_col < num_point; ++p_col)
{
                // Sampling coordinates and weights are still scalar __half; convert them to float here
const float loc_w = __half2float(data_sampling_loc[data_loc_w_ptr]);
const float loc_h = __half2float(data_sampling_loc[data_loc_w_ptr + 1]);
const float weight = __half2float(data_attn_weight[data_weight_ptr]);
const float h_im = loc_h * spatial_h - 0.5f;
const float w_im = loc_w * spatial_w - 0.5f;
if (h_im > -1.0f && w_im > -1.0f && h_im < spatial_h && w_im < spatial_w)
{
float2 val = ms_deform_attn_im2col_bilinear_half2(data_value_ptr, spatial_h, spatial_w, num_heads, channels_half2, h_im, w_im, m_col, c_col_half2);
                    // One fused multiply-add updates two channels at once
col.x += val.x * weight;
col.y += val.y * weight;
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
        // Finally pack the high-precision float2 back into a half2 and store it
*data_col_ptr = __float22half2_rn(col);
}
}
// ============================================================================
// Launcher
// ============================================================================
void ms_deformable_im2col_cuda_half2(hipStream_t stream,
const __half* data_value,
const int64_t* data_spatial_shapes,
const int64_t* data_level_start_index,
const __half* data_sampling_loc,
const __half* data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
__half* data_col)
{
    // Total thread count is halved, which eases scheduling pressure on the DCU
const int num_kernels = batch_size * num_query * num_heads * (channels / 2);
const int num_threads = CUDA_NUM_THREADS;
    // Reinterpret the __half pointers as half2* to get coalesced 32-bit loads and stores
ms_deformable_im2col_gpu_kernel_half2<<<GET_BLOCKS(num_kernels, num_threads), num_threads, 0, stream>>>(
num_kernels, (const half2*)data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, (half2*)data_col);
}
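// Caveat: this half2 path assumes channels is even (each half2 packs two adjacent
// channels). A minimal host-side guard sketch (hypothetical helper, not part of
// the original source):
//   inline bool CanUseHalf2Path(int channels) { return (channels % 2) == 0; }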
\ No newline at end of file
# torch
# torchvision
transformers
addict
yapf
timm
# numpy
opencv-python
supervision>=0.22.0
pycocotools
\ No newline at end of file
# -*- coding: utf-8 -*-
import cv2
import numpy as np
import migraphx
import time
import os
def ReadImage(pathOfImage, inputShape):
"""
读取并预处理图像,转换为模型输入要求的NCHW格式
"""
srcImage = cv2.imread(pathOfImage, cv2.IMREAD_COLOR)
if srcImage is None:
        raise ValueError(f"Failed to read image file: {pathOfImage}")
resizedImage = cv2.resize(srcImage, (inputShape[3], inputShape[2]))
resizedImage_Float = resizedImage.astype("float32")
srcImage_CHW = np.transpose(resizedImage_Float, (2, 0, 1))
mean = np.array([127.5, 127.5, 127.5])
scale = np.array([0.0078125, 0.0078125, 0.0078125])
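    # (x - 127.5) * 0.0078125 maps pixel values from [0, 255] to roughly [-1, 1],
    # since 0.0078125 = 1 / 128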
inputData = np.zeros(inputShape).astype("float32")
for i in range(srcImage_CHW.shape[0]):
inputData[0, i, :, :] = (srcImage_CHW[i, :, :] - mean[i]) * scale[i]
for i in range(inputData.shape[0]):
if i != 0:
inputData[i, :, :, :] = inputData[0, :, :, :]
return inputData
if __name__ == '__main__':
    # ====================== Model paths ======================
    onnx_path = "ResNet50.onnx"
    mxr_path = "ResNet50_gpu.mxr"
    # ====================== MIGraphX: load / compile / save the mxr ======================
    print("🔍 Loading the ONNX model")
    if os.path.exists(mxr_path):
        # Load the previously compiled mxr directly
        model = migraphx.load(mxr_path)
        print("✅ Loaded the compiled model (mxr) from cache")
    else:
        # Compile and save the mxr (migraphx.save(model, path) is the correct call, not model.save())
        model = migraphx.parse_onnx(onnx_path)
        model.compile(migraphx.get_target("gpu"), device_id=0)
        migraphx.save(model, mxr_path)
        print("✅ Model compiled and saved as mxr")
    print("✅ Model ready - running on the MIGraphX GPU backend")
    # ====================== Input / output metadata ======================
    input_name = list(model.get_inputs().keys())[0]
    input_shape = model.get_inputs()[input_name].lens()
    output_name = list(model.get_outputs().keys())[0]
    print(f"Model input name: {input_name}, input shape: {input_shape}")
    print(f"Model output name: {output_name}")
    # ====================== Image preprocessing ======================
pathOfImage = "../images/in/ImageNet_01.jpg"
image = ReadImage(pathOfImage, input_shape)
    # ====================== Warm up 3 times ======================
for i in range(3):
outputs = model.run({input_name: image})
    # ====================== Time 10 inference runs ======================
for i in range(10):
start_time = time.time()
outputs = model.run({input_name: image})
end_time = time.time()
print(f"推理时间: {(end_time - start_time) * 1000:.2f} ms")
\ No newline at end of file
# -*- coding: utf-8 -*-
import cv2
import numpy as np
import migraphx
import time
import os
def ReadImage(pathOfImage, inputShape):
"""
读取并预处理图像,转换为模型输入要求的NCHW格式
"""
srcImage = cv2.imread(pathOfImage, cv2.IMREAD_COLOR)
if srcImage is None:
        raise ValueError(f"Failed to read image file: {pathOfImage}")
resizedImage = cv2.resize(srcImage, (inputShape[3], inputShape[2]))
resizedImage_Float = resizedImage.astype("float32")
srcImage_CHW = np.transpose(resizedImage_Float, (2, 0, 1))
mean = np.array([127.5, 127.5, 127.5])
scale = np.array([0.0078125, 0.0078125, 0.0078125])
inputData = np.zeros(inputShape).astype("float32")
for i in range(srcImage_CHW.shape[0]):
inputData[0, i, :, :] = (srcImage_CHW[i, :, :] - mean[i]) * scale[i]
for i in range(inputData.shape[0]):
if i != 0:
inputData[i, :, :, :] = inputData[0, :, :, :]
return inputData
def AllocateOutputMemory(model):
    # Allocate device memory for every output so the results can stay on the GPU
    outputData = {}
    for key in model.get_outputs().keys():
        outputData[key] = migraphx.allocate_gpu(s=model.get_outputs()[key])
    return outputData
if __name__ == '__main__':
    # ====================== Model paths ======================
    onnx_path = "ResNet50.onnx"
    mxr_path = "ResNet50_gpu.mxr"
    # ====================== MIGraphX: load / compile / save the mxr ======================
    print("🔍 Loading the ONNX model")
    if os.path.exists(mxr_path):
        # Load the previously compiled mxr directly
        model = migraphx.load(mxr_path)
        print("✅ Loaded the compiled model (mxr) from cache")
    else:
        # Compile with offload_copy=False: the caller manages device buffers explicitly
        model = migraphx.parse_onnx(onnx_path)
        model.compile(migraphx.get_target("gpu"), offload_copy=False, device_id=0)
        # Note: migraphx.save(model, path) is the correct call, not model.save()
        migraphx.save(model, mxr_path)
        print("✅ Model compiled and saved as mxr")
    print("✅ Model ready - running on the MIGraphX GPU backend")
    # ====================== Input / output metadata ======================
    input_name = list(model.get_inputs().keys())[0]
    input_shape = model.get_inputs()[input_name].lens()
    output_name = list(model.get_outputs().keys())[0]
    print(f"Model input name: {input_name}, input shape: {input_shape}")
    print(f"Model output name: {output_name}")
    # Allocate device memory for the output nodes to hold the results
    modelData = AllocateOutputMemory(model)
    # ====================== Image preprocessing ======================
    pathOfImage = "../images/in/ImageNet_01.jpg"
    image = ReadImage(pathOfImage, input_shape)
    # Move the input to device memory and add it to the parameter map
    modelData[input_name] = migraphx.to_gpu(migraphx.argument(image))
    # ====================== Warm up 3 times ======================
    # With offload_copy=False, run() must be fed the device-side parameter map,
    # not the host numpy array
    for i in range(3):
        outputs = model.run(modelData)
    # ====================== Time 10 inference runs ======================
    for i in range(10):
        start_time = time.time()
        outputs = model.run(modelData)
        end_time = time.time()
        print(f"Inference time: {(end_time - start_time) * 1000:.2f} ms")
\ No newline at end of file
# -*- coding: utf-8 -*-
import cv2
import numpy as np
import migraphx
import time
def ReadImage(pathOfImage, inputShape):
    """
    Read and preprocess an image into the NCHW layout the model expects
    Args:
        pathOfImage: path to the image file
        inputShape: model input shape (N, C, H, W)
    Returns:
        Preprocessed image data in NCHW layout, float32
    """
    # Read the image as BGR color
    srcImage = cv2.imread(pathOfImage, cv2.IMREAD_COLOR)
    if srcImage is None:
        raise ValueError(f"Failed to read image file: {pathOfImage}")
    # Resize to the model's expected size (W, H)
    resizedImage = cv2.resize(srcImage, (inputShape[3], inputShape[2]))
    # Convert to float32
    resizedImage_Float = resizedImage.astype("float32")
    # HWC -> CHW
    srcImage_CHW = np.transpose(resizedImage_Float, (2, 0, 1))
    # Preprocess: subtract the mean, multiply by the scale factor
    mean = np.array([127.5, 127.5, 127.5])
    scale = np.array([0.0078125, 0.0078125, 0.0078125])
    # Build the NCHW input buffer
    inputData = np.zeros(inputShape).astype("float32")
    # Normalize each channel
    for i in range(srcImage_CHW.shape[0]):
        inputData[0, i, :, :] = (srcImage_CHW[i, :, :] - mean[i]) * scale[i]
    # If the batch dimension is larger than 1, replicate the first sample (demo only)
    for i in range(inputData.shape[0]):
        if i != 0:
            inputData[i, :, :, :] = inputData[0, :, :, :]
    return inputData
if __name__ == '__main__':
    # ====================== 1. Load the ONNX model ======================
    # try:
    #     model = migraphx.parse_onnx("ResNet50.onnx")
    # except Exception as e:
    #     raise RuntimeError(f"Failed to load the model: {e}")
    # Load the ONNX model with graph optimizations enabled
    print("🔍 Loading the ONNX model")
    import onnxruntime as ort
    model_path = 'ResNet50.onnx'
    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL  # enable all graph optimizations
    sess_options.log_severity_level = 3  # reduce log output
    # sess_options.enable_profiling = True  # enable profiling
    # Note: providers takes a list of separate strings; a single comma-joined
    # string is not a valid provider name
    ort_session = ort.InferenceSession(model_path,
                                       sess_options=sess_options,
                                       providers=['ROCMExecutionProvider', 'CPUExecutionProvider'])
    # Inspect the active execution providers
    current_provider = ort_session.get_providers()
    print(f"✅ Model loaded - active execution providers: {current_provider}")
    input_name = ort_session.get_inputs()[0].name
    input_shape = ort_session.get_inputs()[0].shape
    output_name = ort_session.get_outputs()[0].name
    print(f"Model input name: {input_name}, input shape: {input_shape}")
    print(f"Model output name: {output_name}")
    # ====================== 2. Image preprocessing ======================
    pathOfImage = "../images/in/ImageNet_01.jpg"
    try:
        image = ReadImage(pathOfImage, input_shape)
    except Exception as e:
        raise RuntimeError(f"Image preprocessing failed: {e}")
    # Warm up 3 times
    for i in range(3):
        outputs = ort_session.run([output_name], {input_name: image})
    # Time 10 inference runs
    for i in range(10):
        start_time = time.time()
        outputs = ort_session.run([output_name], {input_name: image})
        end_time = time.time()
        print(f"Inference time: {(end_time - start_time) * 1000:.2f} ms")
\ No newline at end of file
import torch
import torchvision
# PyTorch weights file
pathOfPytorchModel = "resnet50-19c8e357.pth"
# Build the ResNet50 architecture
net = torchvision.models.resnet50(pretrained=False)
# Define a dummy input (batch of 32, 3x224x224)
dummy_input = torch.randn(32, 3, 224, 224)
# Path of the ONNX model to generate
pathOfONNX = "ResNet50.onnx"
net.load_state_dict(torch.load(pathOfPytorchModel))
net.eval()
# Export the ONNX model
torch.onnx.export(net, dummy_input, pathOfONNX, input_names=["input"])
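# A variant sketch (assumptions, not in the original): pin the opset and name the
# output so downstream tools can refer to it explicitly
# torch.onnx.export(net, dummy_input, pathOfONNX,
#                   input_names=["input"], output_names=["output"], opset_version=12)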
[
{"cat" : "Session","pid" :1773812,"tid" :1773812,"dur" :91000,"ts" :20,"ph" : "X","name" :"model_loading_uri","args" : {}},
{"cat" : "Session","pid" :1773812,"tid" :1773812,"dur" :4754984,"ts" :91208,"ph" : "X","name" :"session_initialization","args" : {}}
]
[
{"cat" : "Session","pid" :1774715,"tid" :1774715,"dur" :73864,"ts" :6,"ph" : "X","name" :"model_loading_uri","args" : {}},
{"cat" : "Session","pid" :1774715,"tid" :1774715,"dur" :4308806,"ts" :73985,"ph" : "X","name" :"session_initialization","args" : {}}
]