Commit 34e4011b authored by zk

Initial commit

import onnx
import onnxoptimizer as optimizer
def optimize_onnx_model(input_onnx_path, output_onnx_path, fp16=False):
    """
    Optimize an ONNX model: basic graph optimizations plus optional FP16 quantization.
    :param input_onnx_path: path to the original ONNX file
    :param output_onnx_path: path where the optimized model is saved
    :param fp16: enable FP16 quantization (needs GPU support; the speedup is more pronounced)
    """
    # 1. Load the original model
    model = onnx.load(input_onnx_path)
    # 2. Define the optimization passes (add or remove as needed; this is a broadly effective combination)
    optimize_passes = [
        "eliminate_deadend",            # drop nodes whose outputs are unused
        "eliminate_identity",           # remove Identity ops
        "eliminate_unused_initializer", # drop initializers that are never read
        "fuse_bn_into_conv",            # fuse BatchNorm into Conv (the key optimization)
        "fuse_consecutive_concats"      # fuse consecutive Concat ops
    ]
    # 3. Run the basic optimizations
    optimized_model = optimizer.optimize(model, optimize_passes)
    # 4. Optional: FP16 quantization (significant GPU speedup with a controllable accuracy loss)
    # if fp16:
    #     from onnxconverter_common import float16
    #     optimized_model = float16.convert_float_to_float16(
    #         optimized_model,
    #         keep_io_types=True,        # keep inputs/outputs FP32 to avoid dtype-compatibility issues
    #         disable_shape_infer=False
    #     )
    # 5. Validate the optimized model and save it
    onnx.checker.check_model(optimized_model)
    onnx.save(optimized_model, output_onnx_path)
    print(f"Optimized model saved to: {output_onnx_path}")

# Example usage
if __name__ == "__main__":
    # Replace with your own ONNX file paths
    input_onnx = "weights/ground_simplified.onnx"
    output_onnx = "weights/ground_simoptim.onnx"
    # Start with fp16=False (FP32); switch to True (FP16) once you are on a GPU
    optimize_onnx_model(input_onnx, output_onnx, fp16=False)
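
For reference, a minimal sketch of how the optimized model could then be served through ONNX Runtime with the custom op built by the CMake project below; the library path, provider list, and input name/shape are assumptions to adapt:

import numpy as np
import onnxruntime as ort

so = ort.SessionOptions()
# Make custom::ms_deform_attn resolvable (assumed path to the .so built below).
so.register_custom_ops_library("build/libms_deform_attn_ort.so")
sess = ort.InferenceSession("weights/ground_simoptim.onnx", sess_options=so,
                            providers=["ROCMExecutionProvider", "CPUExecutionProvider"])
# Placeholder input name and shape; use the model's real signature.
outputs = sess.run(None, {"input": np.zeros((1, 3, 800, 1200), dtype=np.float32)})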
cmake_minimum_required(VERSION 3.14)
project(GroundingDinoORTPlugin)
# C++ standard
set(CMAKE_CXX_STANDARD 14)
# Tell CMake to use the hipcc compiler
set(CMAKE_CXX_COMPILER "hipcc")
set(CMAKE_C_COMPILER "hipcc")
# 1. Locate the ONNX Runtime headers
# Replace with the directory that actually contains onnxruntime_cxx_api.h in your environment
set(ONNXRUNTIME_INCLUDE_DIR "/opt/dtk-25.04.2/onnxruntime/include")
include_directories(${ONNXRUNTIME_INCLUDE_DIR})
include_directories(.) # add the current directory so the .cuh header can be found
# 2. Build our shared library (.so)
add_library(ms_deform_attn_ort SHARED
ms_deform_attn_ort.hip
)
# =========================================================================
# [Key fix] Tell CMake to compile and link the .hip file as C++
# =========================================================================
set_source_files_properties(ms_deform_attn_ort.hip PROPERTIES LANGUAGE CXX)
set_target_properties(ms_deform_attn_ort PROPERTIES LINKER_LANGUAGE CXX)
# Compiler optimizations
# target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC)
target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC --gpu-max-threads-per-block=1024)
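# A typical out-of-source build of this file (the commands and paths below are
# assumptions, not part of the commit):
#   cmake -S . -B build
#   cmake --build build -j
# yielding build/libms_deform_attn_ort.so, which the Python side registers via
# SessionOptions.register_custom_ops_library.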
// [Key fix 1] Define this macro so ORT does not try to resolve OrtGetApiBase automatically
#define ORT_API_MANUAL_INIT
#include <onnxruntime_c_api.h>
#include <onnxruntime_cxx_api.h>
#include <hip/hip_runtime.h>
#include <vector>
#include <cmath>
#include <algorithm>
// Core compute header
#include "ms_deform_im2col_cuda_hip.cuh"
// ============================================================================
// 1. Kernel definition
// ============================================================================
struct MsDeformAttnKernel {
const OrtApi* api_;
int im2col_step_;
MsDeformAttnKernel(const OrtApi* api, const OrtKernelInfo* info) : api_(api) {
int64_t step = 64;
OrtStatus* status = api_->KernelInfoGetAttribute_int64(info, "im2col_step_i", &step);
if (status != nullptr) {
api_->ReleaseStatus(status);
im2col_step_ = 64; // [Fix 1] fall back to the default
} else {
im2col_step_ = static_cast<int>(step);
}
im2col_step_ = std::max(1, im2col_step_);
}
void CheckStatus(OrtStatus* status) {
if (status != nullptr) {
api_->ReleaseStatus(status);
}
}
std::vector<int64_t> GetTensorDims(const OrtValue* tensor) {
OrtTensorTypeAndShapeInfo* info;
CheckStatus(api_->GetTensorTypeAndShape(tensor, &info));
size_t dim_count;
CheckStatus(api_->GetDimensionsCount(info, &dim_count));
std::vector<int64_t> dims(dim_count);
CheckStatus(api_->GetDimensions(info, dims.data(), dim_count));
api_->ReleaseTensorTypeAndShapeInfo(info);
return dims;
}
void Compute(OrtKernelContext* context) {
OrtStatusPtr status = ComputeV2(context);
if (status != nullptr) {
api_->ReleaseStatus(status);
}
}
OrtStatusPtr ComputeV2(OrtKernelContext* context) {
const OrtValue* value_tensor;
const OrtValue* spatial_shapes_tensor;
const OrtValue* level_start_index_tensor;
const OrtValue* sampling_loc_tensor;
const OrtValue* attn_weight_tensor;
CheckStatus(api_->KernelContext_GetInput(context, 0, &value_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 1, &spatial_shapes_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 2, &level_start_index_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 3, &sampling_loc_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 4, &attn_weight_tensor));
void* value_ptr;
void* spatial_shapes_ptr;
void* level_start_index_ptr;
void* sampling_loc_ptr;
void* attn_weight_ptr;
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(value_tensor), &value_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(spatial_shapes_tensor), &spatial_shapes_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(level_start_index_tensor), &level_start_index_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(sampling_loc_tensor), &sampling_loc_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(attn_weight_tensor), &attn_weight_ptr));
const float* value_data = reinterpret_cast<const float*>(value_ptr);
const int64_t* spatial_shapes_data = reinterpret_cast<const int64_t*>(spatial_shapes_ptr);
const int64_t* level_start_index_data = reinterpret_cast<const int64_t*>(level_start_index_ptr);
const float* sampling_loc_data = reinterpret_cast<const float*>(sampling_loc_ptr);
const float* attn_weight_data = reinterpret_cast<const float*>(attn_weight_ptr);
auto value_dims = GetTensorDims(value_tensor);
auto spatial_shapes_dims = GetTensorDims(spatial_shapes_tensor);
auto sampling_loc_dims = GetTensorDims(sampling_loc_tensor);
const int batch = value_dims[0];
const int spatial_size = value_dims[1];
const int num_heads = value_dims[2];
const int channels = value_dims[3];
const int num_levels = spatial_shapes_dims[0];
const int num_query = sampling_loc_dims[1];
const int num_point = sampling_loc_dims[4];
std::vector<int64_t> output_dims = {batch, num_query, num_heads * channels};
OrtValue* output_tensor;
CheckStatus(api_->KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size(), &output_tensor));
void* output_ptr;
CheckStatus(api_->GetTensorMutableData(output_tensor, &output_ptr));
float* output_data = reinterpret_cast<float*>(output_ptr);
void* stream_ptr;
CheckStatus(api_->KernelContext_GetGPUComputeStream(context, &stream_ptr));
hipStream_t stream = reinterpret_cast<hipStream_t>(stream_ptr);
// const int im2col_step_real = std::min(batch, im2col_step_);
const int im2col_step_real = std::max(1, std::min(batch, im2col_step_));
const int batch_n = im2col_step_real;
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
auto per_output_size = batch_n * num_query * num_heads * channels;
for (int n = 0; n < batch / im2col_step_real; ++n) {
ms_deformable_im2col_cuda(
stream,
value_data + n * im2col_step_real * per_value_size,
spatial_shapes_data,
level_start_index_data,
sampling_loc_data + n * im2col_step_real * per_sample_loc_size,
attn_weight_data + n * im2col_step_real * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
output_data + n * per_output_size
);
}
return nullptr;
}
};
// ============================================================================
// 2. Custom-op interface wrapper
// ============================================================================
struct MsDeformAttnCustomOp : Ort::CustomOpBase<MsDeformAttnCustomOp, MsDeformAttnKernel> {
void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
return new MsDeformAttnKernel(&api, info);
}
OrtStatusPtr CreateKernelV2(const OrtApi& api, const OrtKernelInfo* info, void** op_kernel) const {
*op_kernel = new MsDeformAttnKernel(&api, info);
return nullptr;
}
const char* GetName() const { return "ms_deform_attn"; }
const char* GetExecutionProviderType() const { return "ROCMExecutionProvider"; }
size_t GetInputTypeCount() const { return 5; }
ONNXTensorElementDataType GetInputType(size_t index) const {
if (index == 0 || index == 3 || index == 4) return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}
};
// ============================================================================
// 3. Registration entry point
// ============================================================================
MsDeformAttnCustomOp ms_deform_attn_op;
extern "C" OrtStatus* RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) {
// [Key fix 2] Manually initialize the C++ API with the real API pointer handed over from Python
Ort::InitApi(api->GetApi(ORT_API_VERSION));
OrtCustomOpDomain* domain = nullptr;
const OrtApi* ortApi = api->GetApi(ORT_API_VERSION);
if (auto status = ortApi->CreateCustomOpDomain("custom", &domain)) { return status; }
if (auto status = ortApi->CustomOpDomain_Add(domain, &ms_deform_attn_op)) { return status; }
return ortApi->AddCustomOpDomain(options, domain);
}
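
On the graph side, the exported model has to reference the op under the "custom" domain registered above. A hedged sketch of building such a node by hand with onnx.helper (the tensor names are illustrative; a full model would also need an opset import for the "custom" domain):

import onnx
from onnx import helper

# Matches the kernel above: 5 inputs, 1 output, optional im2col_step_i attribute.
node = helper.make_node(
    "ms_deform_attn",
    inputs=["value", "spatial_shapes", "level_start_index",
            "sampling_locations", "attention_weights"],
    outputs=["output"],
    domain="custom",   # must match CreateCustomOpDomain("custom", ...)
    im2col_step_i=64,  # read by KernelInfoGetAttribute_int64 in the kernel constructor
)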
// !!! This is a file automatically generated by hipify!!!
// #include <ATen/dtk_macros.h>
#include "hip/hip_runtime.h"
/*!
**************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************
* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
* Copyright (c) 2018 Microsoft
**************************************************************************
*/
#include <cstdio>
#include <algorithm>
#include <cstring>
// #include <ATen/ATen.h>
// #include <ATen/hip/HIPContext.h>
// #include <THH/THHAtomics.cuh>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N, const int num_threads)
{
return (N + num_threads - 1) / num_threads;
}
template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const scalar_t lh = h - h_low;
const scalar_t lw = w - w_low;
const scalar_t hh = 1 - lh, hw = 1 - lw;
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
}
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
}
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
}
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
}
const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
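
A NumPy rendering of the same sampling rule, handy for checking the kernel on the CPU. A sketch under the kernel's layout assumption that `data` holds one level's value tensor shaped (H, W, num_heads, channels):

import numpy as np

def im2col_bilinear(data, h, w, m, c):
    # (h, w) are already in the kernel's pixel convention: loc * spatial - 0.5.
    H, W = data.shape[:2]
    h_low, w_low = int(np.floor(h)), int(np.floor(w))
    lh, lw = h - h_low, w - w_low
    hh, hw = 1.0 - lh, 1.0 - lw
    def at(y, x):  # zero padding outside the map, mirroring the kernel's bounds checks
        return float(data[y, x, m, c]) if 0 <= y < H and 0 <= x < W else 0.0
    return (hh * hw * at(h_low, w_low) + hh * lw * at(h_low, w_low + 1) +
            lh * hw * at(h_low + 1, w_low) + lh * lw * at(h_low + 1, w_low + 1))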
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c,
const scalar_t &top_grad,
const scalar_t &attn_weight,
scalar_t* &grad_value,
scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const scalar_t lh = h - h_low;
const scalar_t lw = w - w_low;
const scalar_t hh = 1 - lh, hw = 1 - lw;
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const scalar_t top_grad_value = top_grad * attn_weight;
scalar_t grad_h_weight = 0, grad_w_weight = 0;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
atomicAdd(grad_value+ptr1, w1*top_grad_value);
}
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
atomicAdd(grad_value+ptr2, w2*top_grad_value);
}
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
atomicAdd(grad_value+ptr3, w3*top_grad_value);
}
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
atomicAdd(grad_value+ptr4, w4*top_grad_value);
}
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
*grad_attn_weight = top_grad * val;
*grad_sampling_loc = width * grad_w_weight * top_grad_value;
*(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
}
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c,
const scalar_t &top_grad,
const scalar_t &attn_weight,
scalar_t* &grad_value,
scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const scalar_t lh = h - h_low;
const scalar_t lw = w - w_low;
const scalar_t hh = 1 - lh, hw = 1 - lw;
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const scalar_t top_grad_value = top_grad * attn_weight;
scalar_t grad_h_weight = 0, grad_w_weight = 0;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
atomicAdd(grad_value+ptr1, w1*top_grad_value);
}
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
atomicAdd(grad_value+ptr2, w2*top_grad_value);
}
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
atomicAdd(grad_value+ptr3, w3*top_grad_value);
}
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
atomicAdd(grad_value+ptr4, w4*top_grad_value);
}
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
atomicAdd(grad_attn_weight, top_grad * val);
atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
}
template <typename scalar_t>
__global__ void ms_deformable_im2col_gpu_kernel(const int n,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
scalar_t *data_col_ptr = data_col + index;
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
scalar_t col = 0;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
*data_col_ptr = col;
}
}
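
Putting the pieces together, a NumPy reference of the forward pass this kernel parallelizes over (batch, query, head, channel), reusing im2col_bilinear from the sketch above; it mirrors the kernel's index decomposition and the loc * spatial - 0.5 coordinate convention (a validation sketch, not optimized):

import numpy as np

def ms_deform_attn_forward(value, spatial_shapes, level_start_index,
                           sampling_loc, attn_weight):
    # value: (B, S, M, C); spatial_shapes: (L, 2) rows of (H, W);
    # sampling_loc: (B, Q, M, L, P, 2) normalized as (w, h); attn_weight: (B, Q, M, L, P).
    B, S, M, C = value.shape
    L = spatial_shapes.shape[0]
    Q, P = sampling_loc.shape[1], sampling_loc.shape[4]
    out = np.zeros((B, Q, M * C), dtype=np.float32)
    for b in range(B):
        for l in range(L):
            H, W = int(spatial_shapes[l, 0]), int(spatial_shapes[l, 1])
            start = int(level_start_index[l])
            lvl = value[b, start:start + H * W].reshape(H, W, M, C)
            for q in range(Q):
                for m in range(M):
                    for p in range(P):
                        w_im = sampling_loc[b, q, m, l, p, 0] * W - 0.5
                        h_im = sampling_loc[b, q, m, l, p, 1] * H - 0.5
                        if -1 < h_im < H and -1 < w_im < W:
                            a = attn_weight[b, q, m, l, p]
                            for c in range(C):
                                out[b, q, m * C + c] += a * im2col_bilinear(lvl, h_im, w_im, m, c)
    return out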
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
__shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
__shared__ scalar_t cache_grad_attn_weight[blockSize];
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
if (tid == 0)
{
scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
int sid=2;
for (unsigned int tid = 1; tid < blockSize; ++tid)
{
_grad_w += cache_grad_sampling_loc[sid];
_grad_h += cache_grad_sampling_loc[sid + 1];
_grad_a += cache_grad_attn_weight[tid];
sid += 2;
}
*grad_sampling_loc = _grad_w;
*(grad_sampling_loc + 1) = _grad_h;
*grad_attn_weight = _grad_a;
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
__shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
__shared__ scalar_t cache_grad_attn_weight[blockSize];
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
for (unsigned int s=blockSize/2; s>0; s>>=1)
{
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
}
__syncthreads();
}
if (tid == 0)
{
*grad_sampling_loc = cache_grad_sampling_loc[0];
*(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
*grad_attn_weight = cache_grad_attn_weight[0];
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
extern __shared__ int _s[];
scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
if (tid == 0)
{
scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
int sid=2;
for (unsigned int tid = 1; tid < blockDim.x; ++tid)
{
_grad_w += cache_grad_sampling_loc[sid];
_grad_h += cache_grad_sampling_loc[sid + 1];
_grad_a += cache_grad_attn_weight[tid];
sid += 2;
}
*grad_sampling_loc = _grad_w;
*(grad_sampling_loc + 1) = _grad_h;
*grad_attn_weight = _grad_a;
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
extern __shared__ int _s[];
scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
{
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
if (tid + (s << 1) < spre)
{
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
}
}
__syncthreads();
}
if (tid == 0)
{
*grad_sampling_loc = cache_grad_sampling_loc[0];
*(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
*grad_attn_weight = cache_grad_attn_weight[0];
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
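
The s/spre pair in the reduction loop above is what lets this tree reduction handle active widths that are not powers of two: whenever the odd trailing element at tid + 2s still lies inside the previous active width spre, it is folded in during the same step. A sequential emulation of one block (an illustration only; it ignores the shared-memory and synchronization details):

def block_reduce(vals):
    vals = list(vals)
    s, spre = len(vals) // 2, len(vals)
    while s > 0:
        for tid in range(s):            # threads with tid < s are active
            vals[tid] += vals[tid + s]
            if tid + (s << 1) < spre:   # fold in the odd trailing element
                vals[tid] += vals[tid + (s << 1)]
        s, spre = s >> 1, spre >> 1
    return vals[0]

assert block_reduce(list(range(7))) == sum(range(7))  # works for non-power-of-two sizes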
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
extern __shared__ int _s[];
scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
{
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
if (tid + (s << 1) < spre)
{
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
}
}
__syncthreads();
}
if (tid == 0)
{
atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear_gm(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
grad_sampling_loc, grad_attn_weight);
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t>
void ms_deformable_im2col_cuda(hipStream_t stream,
const scalar_t* data_value,
const int64_t* data_spatial_shapes,
const int64_t* data_level_start_index,
const scalar_t* data_sampling_loc,
const scalar_t* data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t* data_col)
{
const int num_kernels = batch_size * num_query * num_heads * channels;
const int num_actual_kernels = batch_size * num_query * num_heads * channels;
const int num_threads = CUDA_NUM_THREADS;
hipLaunchKernelGGL(( ms_deformable_im2col_gpu_kernel<scalar_t>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
hipError_t err = hipGetLastError();
if (err != hipSuccess)
{
printf("error in ms_deformable_im2col_cuda: %s\n", hipGetErrorString(err));
}
}
template <typename scalar_t>
void ms_deformable_col2im_cuda(hipStream_t stream,
const scalar_t* grad_col,
const scalar_t* data_value,
const int64_t * data_spatial_shapes,
const int64_t * data_level_start_index,
const scalar_t * data_sampling_loc,
const scalar_t * data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t* grad_value,
scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight)
{
const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
const int num_kernels = batch_size * num_query * num_heads * channels;
const int num_actual_kernels = batch_size * num_query * num_heads * channels;
if (channels > 1024)
{
if ((channels & 1023) == 0)
{
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
num_threads*3*sizeof(scalar_t), stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
else
{
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_gm<scalar_t>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
}
else{
switch(channels)
{
case 1:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 2:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 4:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 8:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 16:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 32:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 64:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 128:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 256:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 512:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 1024:
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
default:
if (channels < 64)
{
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
num_threads*3*sizeof(scalar_t), stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
else
{
hipLaunchKernelGGL(( ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
num_threads*3*sizeof(scalar_t), stream,
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
}
}
hipError_t err = hipGetLastError();
if (err != hipSuccess)
{
printf("error in ms_deformable_col2im_cuda: %s\n", hipGetErrorString(err));
}
}
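
The launcher's choice of backward kernel reduces to a small decision table on the channel count; a sketch restating the dispatch above:

def col2im_kernel_choice(channels):
    if channels > 1024:
        # multi-block shared-memory reduction only when channels is a multiple of 1024
        return "shm_reduce_v2_multi_blocks" if channels % 1024 == 0 else "gm"
    if channels in (1, 2, 4, 8, 16, 32):
        return "shm_blocksize_aware_reduce_v1"  # thread 0 sums the cache serially
    if channels in (64, 128, 256, 512, 1024):
        return "shm_blocksize_aware_reduce_v2"  # tree reduction, compile-time block size
    return "shm_reduce_v1" if channels < 64 else "shm_reduce_v2"  # runtime block size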
cmake_minimum_required(VERSION 3.14)
project(GroundingDinoORTPlugin)
# C++ standard
set(CMAKE_CXX_STANDARD 14)
# Tell CMake to use the hipcc compiler
set(CMAKE_CXX_COMPILER "hipcc")
set(CMAKE_C_COMPILER "hipcc")
# 1. Locate the ONNX Runtime headers
# Replace with the directory that actually contains onnxruntime_cxx_api.h in your environment
set(ONNXRUNTIME_INCLUDE_DIR "/opt/dtk-25.04.2/onnxruntime/include")
include_directories(${ONNXRUNTIME_INCLUDE_DIR})
include_directories(.) # add the current directory so the .cuh header can be found
# 2. Build our shared library (.so)
add_library(ms_deform_attn_ort SHARED
ms_deform_attn_ort.hip
)
# =========================================================================
# [Key fix] Tell CMake to compile and link the .hip file as C++
# =========================================================================
set_source_files_properties(ms_deform_attn_ort.hip PROPERTIES LANGUAGE CXX)
set_target_properties(ms_deform_attn_ort PROPERTIES LINKER_LANGUAGE CXX)
# Compiler optimizations
# target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC)
target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC --gpu-max-threads-per-block=1024)
// Define this macro so ORT does not try to resolve OrtGetApiBase automatically
#define ORT_API_MANUAL_INIT
#include <onnxruntime_c_api.h>
#include <onnxruntime_cxx_api.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h> // [Change 1] half-precision header
#include <vector>
#include <cmath>
#include <algorithm>
// Core compute header
#include "ms_deform_im2col_cuda_hip.cuh"
// ============================================================================
// 1. Kernel definition
// ============================================================================
struct MsDeformAttnKernel {
const OrtApi* api_;
int im2col_step_;
MsDeformAttnKernel(const OrtApi* api, const OrtKernelInfo* info) : api_(api) {
int64_t step = 64;
OrtStatus* status = api_->KernelInfoGetAttribute_int64(info, "im2col_step_i", &step);
if (status != nullptr) {
api_->ReleaseStatus(status);
im2col_step_ = 64; // fall back to the default
} else {
im2col_step_ = static_cast<int>(step);
}
im2col_step_ = std::max(1, im2col_step_);
}
void CheckStatus(OrtStatus* status) {
if (status != nullptr) {
api_->ReleaseStatus(status);
}
}
std::vector<int64_t> GetTensorDims(const OrtValue* tensor) {
OrtTensorTypeAndShapeInfo* info;
CheckStatus(api_->GetTensorTypeAndShape(tensor, &info));
size_t dim_count;
CheckStatus(api_->GetDimensionsCount(info, &dim_count));
std::vector<int64_t> dims(dim_count);
CheckStatus(api_->GetDimensions(info, dims.data(), dim_count));
api_->ReleaseTensorTypeAndShapeInfo(info);
return dims;
}
void Compute(OrtKernelContext* context) {
OrtStatusPtr status = ComputeV2(context);
if (status != nullptr) {
api_->ReleaseStatus(status);
}
}
OrtStatusPtr ComputeV2(OrtKernelContext* context) {
const OrtValue* value_tensor;
const OrtValue* spatial_shapes_tensor;
const OrtValue* level_start_index_tensor;
const OrtValue* sampling_loc_tensor;
const OrtValue* attn_weight_tensor;
CheckStatus(api_->KernelContext_GetInput(context, 0, &value_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 1, &spatial_shapes_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 2, &level_start_index_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 3, &sampling_loc_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 4, &attn_weight_tensor));
void* value_ptr;
void* spatial_shapes_ptr;
void* level_start_index_ptr;
void* sampling_loc_ptr;
void* attn_weight_ptr;
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(value_tensor), &value_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(spatial_shapes_tensor), &spatial_shapes_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(level_start_index_tensor), &level_start_index_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(sampling_loc_tensor), &sampling_loc_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(attn_weight_tensor), &attn_weight_ptr));
// [Change 2] The reinterpret_cast targets switch from float to __half
const __half* value_data = reinterpret_cast<const __half*>(value_ptr);
const int64_t* spatial_shapes_data = reinterpret_cast<const int64_t*>(spatial_shapes_ptr); // shapes stay int64
const int64_t* level_start_index_data = reinterpret_cast<const int64_t*>(level_start_index_ptr); // indices stay int64
const __half* sampling_loc_data = reinterpret_cast<const __half*>(sampling_loc_ptr);
const __half* attn_weight_data = reinterpret_cast<const __half*>(attn_weight_ptr);
auto value_dims = GetTensorDims(value_tensor);
auto spatial_shapes_dims = GetTensorDims(spatial_shapes_tensor);
auto sampling_loc_dims = GetTensorDims(sampling_loc_tensor);
const int batch = value_dims[0];
const int spatial_size = value_dims[1];
const int num_heads = value_dims[2];
const int channels = value_dims[3];
const int num_levels = spatial_shapes_dims[0];
const int num_query = sampling_loc_dims[1];
const int num_point = sampling_loc_dims[4];
std::vector<int64_t> output_dims = {batch, num_query, num_heads * channels};
OrtValue* output_tensor;
CheckStatus(api_->KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size(), &output_tensor));
void* output_ptr;
CheckStatus(api_->GetTensorMutableData(output_tensor, &output_ptr));
// [Change 3] The output pointer is reinterpreted as __half
__half* output_data = reinterpret_cast<__half*>(output_ptr);
void* stream_ptr;
CheckStatus(api_->KernelContext_GetGPUComputeStream(context, &stream_ptr));
hipStream_t stream = reinterpret_cast<hipStream_t>(stream_ptr);
const int im2col_step_real = std::max(1, std::min(batch, im2col_step_));
const int batch_n = im2col_step_real;
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
auto per_output_size = batch_n * num_query * num_heads * channels;
for (int n = 0; n < batch / im2col_step_real; ++n) {
// [Change 4] Instantiate the template explicitly with <__half>
ms_deformable_im2col_cuda<__half>(
stream,
value_data + n * im2col_step_real * per_value_size,
spatial_shapes_data,
level_start_index_data,
sampling_loc_data + n * im2col_step_real * per_sample_loc_size,
attn_weight_data + n * im2col_step_real * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
output_data + n * per_output_size
);
}
return nullptr;
}
};
// ============================================================================
// 2. Custom-op interface wrapper
// ============================================================================
struct MsDeformAttnCustomOp : Ort::CustomOpBase<MsDeformAttnCustomOp, MsDeformAttnKernel> {
void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
return new MsDeformAttnKernel(&api, info);
}
OrtStatusPtr CreateKernelV2(const OrtApi& api, const OrtKernelInfo* info, void** op_kernel) const {
*op_kernel = new MsDeformAttnKernel(&api, info);
return nullptr;
}
const char* GetName() const { return "ms_deform_attn"; }
const char* GetExecutionProviderType() const { return "ROCMExecutionProvider"; }
size_t GetInputTypeCount() const { return 5; }
ONNXTensorElementDataType GetInputType(size_t index) const {
// [Change 5] Declare inputs 0, 3 and 4 as FLOAT16
if (index == 0 || index == 3 || index == 4) return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
// [Change 6] Declare the output as FLOAT16
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
}
};
// ============================================================================
// 3. Registration entry point
// ============================================================================
MsDeformAttnCustomOp ms_deform_attn_op;
extern "C" OrtStatus* RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) {
// Manually initialize the C++ API with the real API pointer handed over from Python
Ort::InitApi(api->GetApi(ORT_API_VERSION));
OrtCustomOpDomain* domain = nullptr;
const OrtApi* ortApi = api->GetApi(ORT_API_VERSION);
if (auto status = ortApi->CreateCustomOpDomain("custom", &domain)) { return status; }
if (auto status = ortApi->CustomOpDomain_Add(domain, &ms_deform_attn_op)) { return status; }
return ortApi->AddCustomOpDomain(options, domain);
}
// !!! This is a file automatically generated by hipify!!!
#include "hip/hip_runtime.h"
#include <hip/hip_fp16.h> // [Change 1] half-precision support
/*!
**************************************************************************
* Deformable DETR (Inference Only - Forward Pass)
**************************************************************************
*/
#include <cstdio>
#include <algorithm>
#include <cstring>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N, const int num_threads)
{
return (N + num_threads - 1) / num_threads;
}
// =================================================================================
// Core math: bilinear interpolation (device side)
// [Change 2] The return type is forced to float to keep accumulation precise; the h/w coordinate parameters are float too
// =================================================================================
template <typename scalar_t>
__device__ float ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const float &h, const float &w, const int &m, const int &c)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
// The interpolation coefficients are computed entirely in FP32
const float lh = h - h_low;
const float lw = w - w_low;
const float hh = 1.0f - lh, hw = 1.0f - lw;
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
    // [Change 3] Convert each FP16 (scalar_t) value to float immediately after reading it from device memory
float v1 = 0.0f;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = static_cast<float>(bottom_data[ptr1]);
}
float v2 = 0.0f;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = static_cast<float>(bottom_data[ptr2]);
}
float v3 = 0.0f;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = static_cast<float>(bottom_data[ptr3]);
}
float v4 = 0.0f;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = static_cast<float>(bottom_data[ptr4]);
}
const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
    // Weighted sum in FP32 to avoid overflow and underflow
const float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
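// Worked example: h = 2.3, w = 4.7 gives h_low = 2, w_low = 4, lh = 0.3, lw = 0.7,
// hh = 0.7, hw = 0.3, so w1 = 0.21, w2 = 0.49, w3 = 0.09, w4 = 0.21; the four
// corner weights always sum to 1.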
// =================================================================================
// Main kernel: loop over multi-scale feature levels and sampling points (global side)
// =================================================================================
template <typename scalar_t>
__global__ void ms_deformable_im2col_gpu_kernel(const int n,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
scalar_t *data_col_ptr = data_col + index;
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
        // [Change 4] The accumulator must be float: repeated FP16 additions can overflow or lose precision
float col = 0.0f;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
for (int p_col=0; p_col < num_point; ++p_col)
{
                // [Change 5] Critical: convert the sampling coordinates and attention weight to float as soon as they are read
const float loc_w = static_cast<float>(data_sampling_loc[data_loc_w_ptr]);
const float loc_h = static_cast<float>(data_sampling_loc[data_loc_w_ptr + 1]);
const float weight = static_cast<float>(data_attn_weight[data_weight_ptr]);
                // Compute the actual coordinates in FP32 to preserve precision
const float h_im = loc_h * spatial_h - 0.5f;
const float w_im = loc_w * spatial_w - 0.5f;
if (h_im > -1.0f && w_im > -1.0f && h_im < spatial_h && w_im < spatial_w)
{
                    // ms_deform_attn_im2col_bilinear has been rewritten to return float
col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
        // [Change 6] After accumulation, cast the float result back to FP16 (scalar_t) and write it to device memory
*data_col_ptr = static_cast<scalar_t>(col);
}
}
// =================================================================================
// C++ host-side entry point (launcher)
// =================================================================================
template <typename scalar_t>
void ms_deformable_im2col_cuda(hipStream_t stream,
const scalar_t* data_value,
const int64_t* data_spatial_shapes,
const int64_t* data_level_start_index,
const scalar_t* data_sampling_loc,
const scalar_t* data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t* data_col)
{
const int num_kernels = batch_size * num_query * num_heads * channels;
const int num_actual_kernels = batch_size * num_query * num_heads * channels;
const int num_threads = CUDA_NUM_THREADS;
hipLaunchKernelGGL(( ms_deformable_im2col_gpu_kernel<scalar_t>)
, dim3(GET_BLOCKS(num_actual_kernels, num_threads)), dim3(num_threads),
0, stream,
num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
hipError_t err = hipGetLastError();
if (err != hipSuccess)
{
printf("error in ms_deformable_im2col_cuda: %s\n", hipGetErrorString(err));
}
}
\ No newline at end of file
cmake_minimum_required(VERSION 3.14)
project(GroundingDinoORTPlugin)
# Set the C++ standard
set(CMAKE_CXX_STANDARD 14)
# Tell CMake to use the hipcc compiler
set(CMAKE_CXX_COMPILER "hipcc")
set(CMAKE_C_COMPILER "hipcc")
# 1. Locate the ONNX Runtime headers
# Replace this with the directory that actually contains onnxruntime_cxx_api.h in your environment
set(ONNXRUNTIME_INCLUDE_DIR "/opt/dtk-25.04.2/onnxruntime/include")
include_directories(${ONNXRUNTIME_INCLUDE_DIR})
include_directories(.) # Add the current directory so the .cuh header is found
# 2. Build our shared library (.so)
add_library(ms_deform_attn_ort SHARED
    ms_deform_attn_ort.hip
)
# =========================================================================
# [Key fix] Tell CMake to compile and link the .hip file as C++
# =========================================================================
set_source_files_properties(ms_deform_attn_ort.hip PROPERTIES LANGUAGE CXX)
set_target_properties(ms_deform_attn_ort PROPERTIES LINKER_LANGUAGE CXX)
# Compilation flags
# target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC)
target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC --gpu-max-threads-per-block=1024)
\ No newline at end of file
// Key fix 1: define this macro to disable ORT's automatic lookup of OrtGetApiBase
#define ORT_API_MANUAL_INIT
#include <onnxruntime_c_api.h>
#include <onnxruntime_cxx_api.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <vector>
#include <cmath>
#include <algorithm>
// Core computation header
#include "ms_deform_im2col_cuda_hip.cuh"
// ============================================================================
// 1. Kernel definition
// ============================================================================
struct MsDeformAttnKernel {
const OrtApi* api_;
int im2col_step_;
    // ========================================================================
    // [Object-level GPU memory pool] Kept as member variables so the buffers
    // live for the whole lifetime of the kernel object
    // ========================================================================
float* v_f32_ = nullptr;
size_t v_cap_ = 0;
float* sl_f32_ = nullptr;
size_t sl_cap_ = 0;
float* aw_f32_ = nullptr;
size_t aw_cap_ = 0;
float* out_f32_ = nullptr;
size_t out_cap_ = 0;
MsDeformAttnKernel(const OrtApi* api, const OrtKernelInfo* info) : api_(api) {
int64_t step = 64;
OrtStatus* status = api_->KernelInfoGetAttribute_int64(info, "im2col_step_i", &step);
if (status != nullptr) {
api_->ReleaseStatus(status);
im2col_step_ = 64;
} else {
im2col_step_ = static_cast<int>(step);
}
im2col_step_ = std::max(1, im2col_step_);
}
    // Destructor: free the device buffers when the model is unloaded, so nothing leaks
~MsDeformAttnKernel() {
if (v_f32_) hipFree(v_f32_);
if (sl_f32_) hipFree(sl_f32_);
if (aw_f32_) hipFree(aw_f32_);
if (out_f32_) hipFree(out_f32_);
}
void CheckStatus(OrtStatus* status) {
if (status != nullptr) {
api_->ReleaseStatus(status);
}
}
std::vector<int64_t> GetTensorDims(const OrtValue* tensor) {
OrtTensorTypeAndShapeInfo* info;
CheckStatus(api_->GetTensorTypeAndShape(tensor, &info));
size_t dim_count;
CheckStatus(api_->GetDimensionsCount(info, &dim_count));
std::vector<int64_t> dims(dim_count);
CheckStatus(api_->GetDimensions(info, dims.data(), dim_count));
api_->ReleaseTensorTypeAndShapeInfo(info);
return dims;
}
void Compute(OrtKernelContext* context) {
OrtStatusPtr status = ComputeV2(context);
if (status != nullptr) {
api_->ReleaseStatus(status);
}
}
OrtStatusPtr ComputeV2(OrtKernelContext* context) {
const OrtValue* value_tensor;
const OrtValue* spatial_shapes_tensor;
const OrtValue* level_start_index_tensor;
const OrtValue* sampling_loc_tensor;
const OrtValue* attn_weight_tensor;
CheckStatus(api_->KernelContext_GetInput(context, 0, &value_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 1, &spatial_shapes_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 2, &level_start_index_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 3, &sampling_loc_tensor));
CheckStatus(api_->KernelContext_GetInput(context, 4, &attn_weight_tensor));
void *value_ptr, *spatial_shapes_ptr, *level_start_index_ptr, *sampling_loc_ptr, *attn_weight_ptr;
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(value_tensor), &value_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(spatial_shapes_tensor), &spatial_shapes_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(level_start_index_tensor), &level_start_index_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(sampling_loc_tensor), &sampling_loc_ptr));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(attn_weight_tensor), &attn_weight_ptr));
auto value_dims = GetTensorDims(value_tensor);
auto spatial_shapes_dims = GetTensorDims(spatial_shapes_tensor);
auto sampling_loc_dims = GetTensorDims(sampling_loc_tensor);
const int batch = value_dims[0];
const int spatial_size = value_dims[1];
const int num_heads = value_dims[2];
const int channels = value_dims[3];
const int num_levels = spatial_shapes_dims[0];
const int num_query = sampling_loc_dims[1];
const int num_point = sampling_loc_dims[4];
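        // Expected input layouts (deformable-attention convention, inferred from
        // the indexing below):
        //   value:          [batch, spatial_size, num_heads, channels]
        //   spatial_shapes: [num_levels, 2]
        //   sampling_loc:   [batch, num_query, num_heads, num_levels, num_point, 2]
        //   attn_weight:    [batch, num_query, num_heads, num_levels, num_point]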
std::vector<int64_t> output_dims = {batch, num_query, num_heads * channels};
OrtValue* output_tensor;
CheckStatus(api_->KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size(), &output_tensor));
void* output_ptr;
CheckStatus(api_->GetTensorMutableData(output_tensor, &output_ptr));
void* stream_ptr;
CheckStatus(api_->KernelContext_GetGPUComputeStream(context, &stream_ptr));
hipStream_t stream = reinterpret_cast<hipStream_t>(stream_ptr);
        // ============================================================================
        // [Buffer scheduling] Allocate device memory only on the first frame, and grow
        // the buffers automatically when a larger input resolution arrives
        // ============================================================================
size_t v_bytes = batch * spatial_size * num_heads * channels * sizeof(float);
size_t sl_bytes = batch * num_query * num_heads * num_levels * num_point * 2 * sizeof(float);
size_t aw_bytes = batch * num_query * num_heads * num_levels * num_point * sizeof(float);
size_t out_bytes = batch * num_query * num_heads * channels * sizeof(float);
bool need_sync = (v_bytes > v_cap_) || (sl_bytes > sl_cap_) || (aw_bytes > aw_cap_) || (out_bytes > out_cap_);
if (need_sync) {
            // A larger resolution arrived: synchronize, then safely free and reallocate the old buffers
hipStreamSynchronize(stream);
if (v_bytes > v_cap_) { if (v_f32_) hipFree(v_f32_); hipMalloc((void**)&v_f32_, v_bytes); v_cap_ = v_bytes; }
if (sl_bytes > sl_cap_) { if (sl_f32_) hipFree(sl_f32_); hipMalloc((void**)&sl_f32_, sl_bytes); sl_cap_ = sl_bytes; }
if (aw_bytes > aw_cap_) { if (aw_f32_) hipFree(aw_f32_); hipMalloc((void**)&aw_f32_, aw_bytes); aw_cap_ = aw_bytes; }
if (out_bytes > out_cap_) { if (out_f32_) hipFree(out_f32_); hipMalloc((void**)&out_f32_, out_bytes); out_cap_ = out_bytes; }
}
        // ============================================================================
        // Data flow: FP16 -> FP32 (compute) -> FP16
        // ============================================================================
int v_cnt = v_bytes / sizeof(float);
int sl_cnt = sl_bytes / sizeof(float);
int aw_cnt = aw_bytes / sizeof(float);
int out_cnt = out_bytes / sizeof(float);
RunFastCast<__half, float>(stream, reinterpret_cast<const __half*>(value_ptr), v_f32_, v_cnt);
RunFastCast<__half, float>(stream, reinterpret_cast<const __half*>(sampling_loc_ptr), sl_f32_, sl_cnt);
RunFastCast<__half, float>(stream, reinterpret_cast<const __half*>(attn_weight_ptr), aw_f32_, aw_cnt);
const int64_t* spatial_shapes_data = reinterpret_cast<const int64_t*>(spatial_shapes_ptr);
const int64_t* level_start_index_data = reinterpret_cast<const int64_t*>(level_start_index_ptr);
const int im2col_step_real = std::max(1, std::min(batch, im2col_step_));
const int batch_n = im2col_step_real;
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
auto per_output_size = batch_n * num_query * num_heads * channels;
for (int n = 0; n < batch / im2col_step_real; ++n) {
ms_deformable_im2col_cuda_fp32(
stream,
v_f32_ + n * im2col_step_real * per_value_size,
spatial_shapes_data,
level_start_index_data,
sl_f32_ + n * im2col_step_real * per_sample_loc_size,
aw_f32_ + n * im2col_step_real * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
out_f32_ + n * per_output_size
);
}
RunFastCast<float, __half>(stream, out_f32_, reinterpret_cast<__half*>(output_ptr), out_cnt);
        // Deliberately no hipFree here: keep the buffers alive and reuse them on the next frame
return nullptr;
}
};
// ============================================================================
// 2. Custom op interface wrapper
// ============================================================================
struct MsDeformAttnCustomOp : Ort::CustomOpBase<MsDeformAttnCustomOp, MsDeformAttnKernel> {
void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
return new MsDeformAttnKernel(&api, info);
}
OrtStatusPtr CreateKernelV2(const OrtApi& api, const OrtKernelInfo* info, void** op_kernel) const {
*op_kernel = new MsDeformAttnKernel(&api, info);
return nullptr;
}
const char* GetName() const { return "ms_deform_attn"; }
const char* GetExecutionProviderType() const { return "ROCMExecutionProvider"; }
size_t GetInputTypeCount() const { return 5; }
ONNXTensorElementDataType GetInputType(size_t index) const {
if (index == 0 || index == 3 || index == 4) return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
}
};
// ============================================================================
// 3. Registration entry point
// ============================================================================
MsDeformAttnCustomOp ms_deform_attn_op;
extern "C" OrtStatus* RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) {
Ort::InitApi(api->GetApi(ORT_API_VERSION));
OrtCustomOpDomain* domain = nullptr;
const OrtApi* ortApi = api->GetApi(ORT_API_VERSION);
if (auto status = ortApi->CreateCustomOpDomain("custom", &domain)) { return status; }
if (auto status = ortApi->CustomOpDomain_Add(domain, &ms_deform_attn_op)) { return status; }
return ortApi->AddCustomOpDomain(options, domain);
}
\ No newline at end of file
#include "hip/hip_runtime.h"
#include <hip/hip_fp16.h>
#include <cstdio>
#include <algorithm>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N, const int num_threads) {
return (N + num_threads - 1) / num_threads;
}
// ============================================================================
// [Approach B] Fast elementwise cast kernels (FP16 <-> FP32)
// ============================================================================
template <typename T_IN, typename T_OUT>
__global__ void FastCastKernel(const T_IN* input, T_OUT* output, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
output[i] = static_cast<T_OUT>(input[i]);
}
}
template <typename T_IN, typename T_OUT>
void RunFastCast(hipStream_t stream, const T_IN* input, T_OUT* output, int n) {
if (n <= 0) return;
const int threads = 512;
const int blocks = (n + threads - 1) / threads;
    // [Key fix 1] Use the <<< >>> launch syntax to avoid hipLaunchKernelGGL's macro-parsing problem with comma-separated template arguments
FastCastKernel<T_IN, T_OUT><<<blocks, threads, 0, stream>>>(input, output, n);
}
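// Usage sketch: RunFastCast<__half, float>(stream, d_in_fp16, d_out_fp32, count)
// widens a device buffer to FP32; RunFastCast<float, __half>(...) narrows it back.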
// ============================================================================
// FP32 bilinear interpolation (device)
// ============================================================================
__device__ float ms_deform_attn_im2col_bilinear_fp32(const float* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const float &h, const float &w, const int &m, const int &c)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const float lh = h - h_low;
const float lw = w - w_low;
const float hh = 1.0f - lh, hw = 1.0f - lw;
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
float v1 = 0;
if (h_low >= 0 && w_low >= 0) {
v1 = bottom_data[h_low_ptr_offset + w_low_ptr_offset + base_ptr];
}
float v2 = 0;
if (h_low >= 0 && w_high <= width - 1) {
v2 = bottom_data[h_low_ptr_offset + w_high_ptr_offset + base_ptr];
}
float v3 = 0;
if (h_high <= height - 1 && w_low >= 0) {
v3 = bottom_data[h_high_ptr_offset + w_low_ptr_offset + base_ptr];
}
float v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) {
v4 = bottom_data[h_high_ptr_offset + w_high_ptr_offset + base_ptr];
}
return (hh * hw * v1 + hh * lw * v2 + lh * hw * v3 + lh * lw * v4);
}
// ============================================================================
// FP32 core compute kernel (global)
// ============================================================================
__global__ void ms_deformable_im2col_gpu_kernel_fp32(const int n,
const float *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const float *data_sampling_loc,
const float *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
float *data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
int _temp = index;
const int c_col = _temp % channels; _temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads; _temp /= num_heads;
const int q_col = _temp % num_query; _temp /= num_query;
const int b_col = _temp;
float *data_col_ptr = data_col + index;
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
float col = 0.0f;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const float *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
for (int p_col=0; p_col < num_point; ++p_col)
{
const float loc_w = data_sampling_loc[data_loc_w_ptr];
const float loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const float weight = data_attn_weight[data_weight_ptr];
const float h_im = loc_h * spatial_h - 0.5f;
const float w_im = loc_w * spatial_w - 0.5f;
if (h_im > -1.0f && w_im > -1.0f && h_im < spatial_h && w_im < spatial_w)
{
col += ms_deform_attn_im2col_bilinear_fp32(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
*data_col_ptr = col;
}
}
// ============================================================================
// Kernel launcher
// ============================================================================
void ms_deformable_im2col_cuda_fp32(hipStream_t stream,
const float* data_value,
const int64_t* data_spatial_shapes,
const int64_t* data_level_start_index,
const float* data_sampling_loc,
const float* data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
float* data_col)
{
const int num_kernels = batch_size * num_query * num_heads * channels;
const int num_threads = CUDA_NUM_THREADS;
    // [Key fix 2] Likewise use the <<< >>> syntax to sidestep the macro-expansion pitfall
ms_deformable_im2col_gpu_kernel_fp32<<<GET_BLOCKS(num_kernels, num_threads), num_threads, 0, stream>>>(
num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
}
\ No newline at end of file
cmake_minimum_required(VERSION 3.14)
project(GroundingDinoORTPlugin)
# Set the C++ standard
set(CMAKE_CXX_STANDARD 14)
# Tell CMake to use the hipcc compiler
set(CMAKE_CXX_COMPILER "hipcc")
set(CMAKE_C_COMPILER "hipcc")
# 1. Locate the ONNX Runtime headers
# Replace this with the directory that actually contains onnxruntime_cxx_api.h in your environment
set(ONNXRUNTIME_INCLUDE_DIR "/opt/dtk-25.04.2/onnxruntime/include")
include_directories(${ONNXRUNTIME_INCLUDE_DIR})
include_directories(.) # Add the current directory so the .cuh header is found
# 2. Build our shared library (.so)
add_library(ms_deform_attn_ort SHARED
    ms_deform_attn_ort.hip
)
# =========================================================================
# [Key fix] Tell CMake to compile and link the .hip file as C++
# =========================================================================
set_source_files_properties(ms_deform_attn_ort.hip PROPERTIES LANGUAGE CXX)
set_target_properties(ms_deform_attn_ort PROPERTIES LINKER_LANGUAGE CXX)
# Compilation flags
# target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC)
target_compile_options(ms_deform_attn_ort PRIVATE -O3 -fPIC --gpu-max-threads-per-block=1024)
\ No newline at end of file
#define ORT_API_MANUAL_INIT
#include <onnxruntime_c_api.h>
#include <onnxruntime_cxx_api.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <vector>
#include <cmath>
#include <algorithm>
// Core computation header
#include "ms_deform_im2col_cuda_hip.cuh"
// ============================================================================
// 1. Kernel definition (clean native-FP16 interface)
// ============================================================================
struct MsDeformAttnKernel {
const OrtApi* api_;
int im2col_step_;
MsDeformAttnKernel(const OrtApi* api, const OrtKernelInfo* info) : api_(api) {
int64_t step = 64;
OrtStatus* status = api_->KernelInfoGetAttribute_int64(info, "im2col_step_i", &step);
if (status != nullptr) {
api_->ReleaseStatus(status);
im2col_step_ = 64;
} else {
im2col_step_ = static_cast<int>(step);
}
im2col_step_ = std::max(1, im2col_step_);
}
void CheckStatus(OrtStatus* status) {
if (status != nullptr) api_->ReleaseStatus(status);
}
std::vector<int64_t> GetTensorDims(const OrtValue* tensor) {
OrtTensorTypeAndShapeInfo* info;
CheckStatus(api_->GetTensorTypeAndShape(tensor, &info));
size_t dim_count;
CheckStatus(api_->GetDimensionsCount(info, &dim_count));
std::vector<int64_t> dims(dim_count);
CheckStatus(api_->GetDimensions(info, dims.data(), dim_count));
api_->ReleaseTensorTypeAndShapeInfo(info);
return dims;
}
void Compute(OrtKernelContext* context) {
OrtStatusPtr status = ComputeV2(context);
if (status != nullptr) api_->ReleaseStatus(status);
}
OrtStatusPtr ComputeV2(OrtKernelContext* context) {
const OrtValue *v_t, *ss_t, *lsi_t, *sl_t, *aw_t;
CheckStatus(api_->KernelContext_GetInput(context, 0, &v_t));
CheckStatus(api_->KernelContext_GetInput(context, 1, &ss_t));
CheckStatus(api_->KernelContext_GetInput(context, 2, &lsi_t));
CheckStatus(api_->KernelContext_GetInput(context, 3, &sl_t));
CheckStatus(api_->KernelContext_GetInput(context, 4, &aw_t));
void *v_p, *ss_p, *lsi_p, *sl_p, *aw_p;
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(v_t), &v_p));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(ss_t), &ss_p));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(lsi_t), &lsi_p));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(sl_t), &sl_p));
CheckStatus(api_->GetTensorMutableData(const_cast<OrtValue*>(aw_t), &aw_p));
auto v_dims = GetTensorDims(v_t);
auto sl_dims = GetTensorDims(sl_t);
auto ss_dims = GetTensorDims(ss_t);
const int batch = v_dims[0];
const int spatial_size = v_dims[1];
const int num_heads = v_dims[2];
const int channels = v_dims[3];
const int num_levels = ss_dims[0];
const int num_query = sl_dims[1];
const int num_point = sl_dims[4];
std::vector<int64_t> output_dims = {batch, num_query, num_heads * channels};
OrtValue* output_tensor;
CheckStatus(api_->KernelContext_GetOutput(context, 0, output_dims.data(), output_dims.size(), &output_tensor));
void* output_ptr;
CheckStatus(api_->GetTensorMutableData(output_tensor, &output_ptr));
void* stream_ptr;
CheckStatus(api_->KernelContext_GetGPUComputeStream(context, &stream_ptr));
hipStream_t stream = reinterpret_cast<hipStream_t>(stream_ptr);
        // Reinterpret the raw pointers as native __half
const __half* v_f16 = reinterpret_cast<const __half*>(v_p);
const __half* sl_f16 = reinterpret_cast<const __half*>(sl_p);
const __half* aw_f16 = reinterpret_cast<const __half*>(aw_p);
__half* out_f16 = reinterpret_cast<__half*>(output_ptr);
const int64_t* ss_i64 = reinterpret_cast<const int64_t*>(ss_p);
const int64_t* lsi_i64 = reinterpret_cast<const int64_t*>(lsi_p);
const int im2col_step_real = std::max(1, std::min(batch, im2col_step_));
const int batch_n = im2col_step_real;
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
auto per_output_size = batch_n * num_query * num_heads * channels;
        // Key change: no intermediate casts at all; hand the FP16 tensors straight to the half2 kernel
for (int n = 0; n < batch / im2col_step_real; ++n) {
ms_deformable_im2col_cuda_half2(
stream,
v_f16 + n * im2col_step_real * per_value_size,
ss_i64,
lsi_i64,
sl_f16 + n * im2col_step_real * per_sample_loc_size,
aw_f16 + n * im2col_step_real * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
out_f16 + n * per_output_size
);
}
return nullptr;
}
};
// ============================================================================
// 2. Custom op interface wrapper
// ============================================================================
struct MsDeformAttnCustomOp : Ort::CustomOpBase<MsDeformAttnCustomOp, MsDeformAttnKernel> {
void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
return new MsDeformAttnKernel(&api, info);
}
OrtStatusPtr CreateKernelV2(const OrtApi& api, const OrtKernelInfo* info, void** op_kernel) const {
*op_kernel = new MsDeformAttnKernel(&api, info);
return nullptr;
}
const char* GetName() const { return "ms_deform_attn"; }
const char* GetExecutionProviderType() const { return "ROCMExecutionProvider"; }
size_t GetInputTypeCount() const { return 5; }
ONNXTensorElementDataType GetInputType(size_t index) const {
if (index == 0 || index == 3 || index == 4) return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}
size_t GetOutputTypeCount() const { return 1; }
ONNXTensorElementDataType GetOutputType(size_t index) const {
return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
}
};
// ============================================================================
// 3. Registration entry point
// ============================================================================
MsDeformAttnCustomOp ms_deform_attn_op;
extern "C" OrtStatus* RegisterCustomOps(OrtSessionOptions* options, const OrtApiBase* api) {
Ort::InitApi(api->GetApi(ORT_API_VERSION));
OrtCustomOpDomain* domain = nullptr;
const OrtApi* ortApi = api->GetApi(ORT_API_VERSION);
if (auto status = ortApi->CreateCustomOpDomain("custom", &domain)) { return status; }
if (auto status = ortApi->CustomOpDomain_Add(domain, &ms_deform_attn_op)) { return status; }
return ortApi->AddCustomOpDomain(options, domain);
}
\ No newline at end of file
#include "hip/hip_runtime.h"
#include <hip/hip_fp16.h>
#include <cstdio>
#include <algorithm>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N, const int num_threads) {
return (N + num_threads - 1) / num_threads;
}
// ============================================================================
// [Approach A] Vectorized bilinear interpolation (2 channels per lookup)
// ============================================================================
__device__ float2 ms_deform_attn_im2col_bilinear_half2(const half2* bottom_data,
const int height, const int width, const int nheads, const int channels_half2,
const float h, const float w, const int m, const int c_half2)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const float lh = h - h_low;
const float lw = w - w_low;
const float hh = 1.0f - lh, hw = 1.0f - lw;
    // Note: all strides here are measured in half2 units
const int w_stride = nheads * channels_half2;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels_half2 + c_half2;
    // A half2 holding two zeros
half2 zero_h2 = __float2half2_rn(0.0f);
    // Each corner is fetched with a single 32-bit (half2) load
half2 v1_h2 = (h_low >= 0 && w_low >= 0) ? bottom_data[h_low_ptr_offset + w_low_ptr_offset + base_ptr] : zero_h2;
half2 v2_h2 = (h_low >= 0 && w_high <= width - 1) ? bottom_data[h_low_ptr_offset + w_high_ptr_offset + base_ptr] : zero_h2;
half2 v3_h2 = (h_high <= height - 1 && w_low >= 0) ? bottom_data[h_high_ptr_offset + w_low_ptr_offset + base_ptr] : zero_h2;
half2 v4_h2 = (h_high <= height - 1 && w_high <= width - 1) ? bottom_data[h_high_ptr_offset + w_high_ptr_offset + base_ptr] : zero_h2;
    // Convert losslessly to float2 for high-precision accumulation
float2 v1 = __half22float2(v1_h2);
float2 v2 = __half22float2(v2_h2);
float2 v3 = __half22float2(v3_h2);
float2 v4 = __half22float2(v4_h2);
float2 res;
res.x = hh * hw * v1.x + hh * lw * v2.x + lh * hw * v3.x + lh * lw * v4.x;
res.y = hh * hw * v1.y + hh * lw * v2.y + lh * hw * v3.y + lh * lw * v4.y;
return res;
}
// ============================================================================
// [Approach A] Native FP16 + half2 vectorized compute kernel
// ============================================================================
__global__ void ms_deformable_im2col_gpu_kernel_half2(const int n,
const half2 *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const __half *data_sampling_loc,
const __half *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
half2 *data_col)
{
    // n is batch_size * num_query * num_heads * (channels / 2)
CUDA_KERNEL_LOOP(index, n)
{
int _temp = index;
        const int channels_half2 = channels / 2; // channel count halved: one half2 covers two channels
const int c_col_half2 = _temp % channels_half2; _temp /= channels_half2;
const int sampling_index = _temp;
const int m_col = _temp % num_heads; _temp /= num_heads;
const int q_col = _temp % num_query; _temp /= num_query;
const int b_col = _temp;
half2 *data_col_ptr = data_col + index;
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int qid_stride_half2 = num_heads * channels_half2;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride_half2;
        // Accumulate in float2 so repeated FP16 additions cannot overflow
float2 col;
col.x = 0.0f; col.y = 0.0f;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const half2 *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride_half2);
for (int p_col=0; p_col < num_point; ++p_col)
{
                // Sampling coordinates and weights are still scalar __half; convert them to float here
const float loc_w = __half2float(data_sampling_loc[data_loc_w_ptr]);
const float loc_h = __half2float(data_sampling_loc[data_loc_w_ptr + 1]);
const float weight = __half2float(data_attn_weight[data_weight_ptr]);
const float h_im = loc_h * spatial_h - 0.5f;
const float w_im = loc_w * spatial_w - 0.5f;
if (h_im > -1.0f && w_im > -1.0f && h_im < spatial_h && w_im < spatial_w)
{
float2 val = ms_deform_attn_im2col_bilinear_half2(data_value_ptr, spatial_h, spatial_w, num_heads, channels_half2, h_im, w_im, m_col, c_col_half2);
                    // One fused multiply-add updates two channels at once
col.x += val.x * weight;
col.y += val.y * weight;
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
        // Finally pack the high-precision float2 back into a half2 and store it
*data_col_ptr = __float22half2_rn(col);
}
}
// ============================================================================
// Launcher
// ============================================================================
void ms_deformable_im2col_cuda_half2(hipStream_t stream,
const __half* data_value,
const int64_t* data_spatial_shapes,
const int64_t* data_level_start_index,
const __half* data_sampling_loc,
const __half* data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
__half* data_col)
{
    // Total thread count is halved, which eases scheduling pressure on the DCU
const int num_kernels = batch_size * num_query * num_heads * (channels / 2);
const int num_threads = CUDA_NUM_THREADS;
    // Reinterpret the __half pointers as half2* to get coalesced 32-bit loads and stores
ms_deformable_im2col_gpu_kernel_half2<<<GET_BLOCKS(num_kernels, num_threads), num_threads, 0, stream>>>(
num_kernels, (const half2*)data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, (half2*)data_col);
}
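// Caveat: this half2 path assumes channels is even (each half2 packs two adjacent
// channels). A minimal host-side guard sketch (hypothetical helper, not part of
// the original source):
//   inline bool CanUseHalf2Path(int channels) { return (channels % 2) == 0; }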
\ No newline at end of file
# torch
# torchvision
transformers
addict
yapf
timm
# numpy
opencv-python
supervision>=0.22.0
pycocotools
\ No newline at end of file
# -*- coding: utf-8 -*-
import cv2
import numpy as np
import migraphx
import time
import os
def ReadImage(pathOfImage, inputShape):
"""
读取并预处理图像,转换为模型输入要求的NCHW格式
"""
srcImage = cv2.imread(pathOfImage, cv2.IMREAD_COLOR)
if srcImage is None:
        raise ValueError(f"Failed to read image file: {pathOfImage}")
resizedImage = cv2.resize(srcImage, (inputShape[3], inputShape[2]))
resizedImage_Float = resizedImage.astype("float32")
srcImage_CHW = np.transpose(resizedImage_Float, (2, 0, 1))
mean = np.array([127.5, 127.5, 127.5])
scale = np.array([0.0078125, 0.0078125, 0.0078125])
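    # (x - 127.5) * 0.0078125 maps pixel values from [0, 255] to roughly [-1, 1],
    # since 0.0078125 = 1 / 128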
inputData = np.zeros(inputShape).astype("float32")
for i in range(srcImage_CHW.shape[0]):
inputData[0, i, :, :] = (srcImage_CHW[i, :, :] - mean[i]) * scale[i]
for i in range(inputData.shape[0]):
if i != 0:
inputData[i, :, :, :] = inputData[0, :, :, :]
return inputData
if __name__ == '__main__':
    # ====================== Model paths ======================
    onnx_path = "ResNet50.onnx"
    mxr_path = "ResNet50_gpu.mxr"
    # ====================== MIGraphX: load / compile / save the mxr ======================
    print("🔍 Loading the ONNX model")
    if os.path.exists(mxr_path):
        # Load the previously compiled mxr directly
        model = migraphx.load(mxr_path)
        print("✅ Loaded the compiled model (mxr) from cache")
    else:
        # Compile and save the mxr (migraphx.save(model, path) is the correct call, not model.save())
        model = migraphx.parse_onnx(onnx_path)
        model.compile(migraphx.get_target("gpu"), device_id=0)
        migraphx.save(model, mxr_path)
        print("✅ Model compiled and saved as mxr")
    print("✅ Model ready - running on the MIGraphX GPU backend")
    # ====================== Input / output metadata ======================
    input_name = list(model.get_inputs().keys())[0]
    input_shape = model.get_inputs()[input_name].lens()
    output_name = list(model.get_outputs().keys())[0]
    print(f"Model input name: {input_name}, input shape: {input_shape}")
    print(f"Model output name: {output_name}")
    # ====================== Image preprocessing ======================
pathOfImage = "../images/in/ImageNet_01.jpg"
image = ReadImage(pathOfImage, input_shape)
    # ====================== Warm up 3 times ======================
for i in range(3):
outputs = model.run({input_name: image})
    # ====================== Time 10 inference runs ======================
for i in range(10):
start_time = time.time()
outputs = model.run({input_name: image})
end_time = time.time()
print(f"推理时间: {(end_time - start_time) * 1000:.2f} ms")
\ No newline at end of file
# -*- coding: utf-8 -*-
import cv2
import numpy as np
import migraphx
import time
import os
def ReadImage(pathOfImage, inputShape):
"""
读取并预处理图像,转换为模型输入要求的NCHW格式
"""
srcImage = cv2.imread(pathOfImage, cv2.IMREAD_COLOR)
if srcImage is None:
        raise ValueError(f"Failed to read image file: {pathOfImage}")
resizedImage = cv2.resize(srcImage, (inputShape[3], inputShape[2]))
resizedImage_Float = resizedImage.astype("float32")
srcImage_CHW = np.transpose(resizedImage_Float, (2, 0, 1))
mean = np.array([127.5, 127.5, 127.5])
scale = np.array([0.0078125, 0.0078125, 0.0078125])
inputData = np.zeros(inputShape).astype("float32")
for i in range(srcImage_CHW.shape[0]):
inputData[0, i, :, :] = (srcImage_CHW[i, :, :] - mean[i]) * scale[i]
for i in range(inputData.shape[0]):
if i != 0:
inputData[i, :, :, :] = inputData[0, :, :, :]
return inputData
def AllocateOutputMemory(model):
    # Allocate device memory for every output so the results can stay on the GPU
    outputData = {}
    for key in model.get_outputs().keys():
        outputData[key] = migraphx.allocate_gpu(s=model.get_outputs()[key])
    return outputData
if __name__ == '__main__':
    # ====================== Model paths ======================
    onnx_path = "ResNet50.onnx"
    mxr_path = "ResNet50_gpu.mxr"
    # ====================== MIGraphX: load / compile / save the mxr ======================
    print("🔍 Loading the ONNX model")
    if os.path.exists(mxr_path):
        # Load the previously compiled mxr directly
        model = migraphx.load(mxr_path)
        print("✅ Loaded the compiled model (mxr) from cache")
    else:
        # Compile with offload_copy=False: the caller manages device buffers explicitly
        model = migraphx.parse_onnx(onnx_path)
        model.compile(migraphx.get_target("gpu"), offload_copy=False, device_id=0)
        # Note: migraphx.save(model, path) is the correct call, not model.save()
        migraphx.save(model, mxr_path)
        print("✅ Model compiled and saved as mxr")
    print("✅ Model ready - running on the MIGraphX GPU backend")
    # ====================== Input / output metadata ======================
    input_name = list(model.get_inputs().keys())[0]
    input_shape = model.get_inputs()[input_name].lens()
    output_name = list(model.get_outputs().keys())[0]
    print(f"Model input name: {input_name}, input shape: {input_shape}")
    print(f"Model output name: {output_name}")
    # Allocate device memory for the output nodes to hold the results
    modelData = AllocateOutputMemory(model)
    # ====================== Image preprocessing ======================
    pathOfImage = "../images/in/ImageNet_01.jpg"
    image = ReadImage(pathOfImage, input_shape)
    # Move the input to device memory and add it to the parameter map
    modelData[input_name] = migraphx.to_gpu(migraphx.argument(image))
    # ====================== Warm up 3 times ======================
    # With offload_copy=False, run() must be fed the device-side parameter map,
    # not the host numpy array
    for i in range(3):
        outputs = model.run(modelData)
    # ====================== Time 10 inference runs ======================
    for i in range(10):
        start_time = time.time()
        outputs = model.run(modelData)
        end_time = time.time()
        print(f"Inference time: {(end_time - start_time) * 1000:.2f} ms")
\ No newline at end of file
# -*- coding: utf-8 -*-
import cv2
import numpy as np
import migraphx
import time
def ReadImage(pathOfImage, inputShape):
    """
    Read and preprocess an image into the NCHW layout the model expects
    Args:
        pathOfImage: path to the image file
        inputShape: model input shape (N, C, H, W)
    Returns:
        Preprocessed image data in NCHW layout, float32
    """
    # Read the image as BGR color
    srcImage = cv2.imread(pathOfImage, cv2.IMREAD_COLOR)
    if srcImage is None:
        raise ValueError(f"Failed to read image file: {pathOfImage}")
    # Resize to the model's expected size (W, H)
    resizedImage = cv2.resize(srcImage, (inputShape[3], inputShape[2]))
    # Convert to float32
    resizedImage_Float = resizedImage.astype("float32")
    # HWC -> CHW
    srcImage_CHW = np.transpose(resizedImage_Float, (2, 0, 1))
    # Preprocess: subtract the mean, multiply by the scale factor
    mean = np.array([127.5, 127.5, 127.5])
    scale = np.array([0.0078125, 0.0078125, 0.0078125])
    # Build the NCHW input buffer
    inputData = np.zeros(inputShape).astype("float32")
    # Normalize each channel
    for i in range(srcImage_CHW.shape[0]):
        inputData[0, i, :, :] = (srcImage_CHW[i, :, :] - mean[i]) * scale[i]
    # If the batch dimension is larger than 1, replicate the first sample (demo only)
    for i in range(inputData.shape[0]):
        if i != 0:
            inputData[i, :, :, :] = inputData[0, :, :, :]
    return inputData
if __name__ == '__main__':
    # ====================== 1. Load the ONNX model ======================
    # try:
    #     model = migraphx.parse_onnx("ResNet50.onnx")
    # except Exception as e:
    #     raise RuntimeError(f"Failed to load the model: {e}")
    # Load the ONNX model with graph optimizations enabled
    print("🔍 Loading the ONNX model")
    import onnxruntime as ort
    model_path = 'ResNet50.onnx'
    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL  # enable all graph optimizations
    sess_options.log_severity_level = 3  # reduce log output
    # sess_options.enable_profiling = True  # enable profiling
    # Note: providers takes a list of separate strings; a single comma-joined
    # string is not a valid provider name
    ort_session = ort.InferenceSession(model_path,
                                       sess_options=sess_options,
                                       providers=['ROCMExecutionProvider', 'CPUExecutionProvider'])
    # Inspect the active execution providers
    current_provider = ort_session.get_providers()
    print(f"✅ Model loaded - active execution providers: {current_provider}")
    input_name = ort_session.get_inputs()[0].name
    input_shape = ort_session.get_inputs()[0].shape
    output_name = ort_session.get_outputs()[0].name
    print(f"Model input name: {input_name}, input shape: {input_shape}")
    print(f"Model output name: {output_name}")
    # ====================== 2. Image preprocessing ======================
    pathOfImage = "../images/in/ImageNet_01.jpg"
    try:
        image = ReadImage(pathOfImage, input_shape)
    except Exception as e:
        raise RuntimeError(f"Image preprocessing failed: {e}")
    # Warm up 3 times
    for i in range(3):
        outputs = ort_session.run([output_name], {input_name: image})
    # Time 10 inference runs
    for i in range(10):
        start_time = time.time()
        outputs = ort_session.run([output_name], {input_name: image})
        end_time = time.time()
        print(f"Inference time: {(end_time - start_time) * 1000:.2f} ms")
\ No newline at end of file
import torch
import torchvision
# PyTorch weights file
pathOfPytorchModel = "resnet50-19c8e357.pth"
# Build the ResNet50 architecture
net = torchvision.models.resnet50(pretrained=False)
# Define a dummy input (batch of 32, 3x224x224)
dummy_input = torch.randn(32, 3, 224, 224)
# Path of the ONNX model to generate
pathOfONNX = "ResNet50.onnx"
net.load_state_dict(torch.load(pathOfPytorchModel))
net.eval()
# Export the ONNX model
torch.onnx.export(net, dummy_input, pathOfONNX, input_names=["input"])
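# A variant sketch (assumptions, not in the original): pin the opset and name the
# output so downstream tools can refer to it explicitly
# torch.onnx.export(net, dummy_input, pathOfONNX,
#                   input_names=["input"], output_names=["output"], opset_version=12)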
[
{"cat" : "Session","pid" :1773812,"tid" :1773812,"dur" :91000,"ts" :20,"ph" : "X","name" :"model_loading_uri","args" : {}},
{"cat" : "Session","pid" :1773812,"tid" :1773812,"dur" :4754984,"ts" :91208,"ph" : "X","name" :"session_initialization","args" : {}}
]
[
{"cat" : "Session","pid" :1774715,"tid" :1774715,"dur" :73864,"ts" :6,"ph" : "X","name" :"model_loading_uri","args" : {}},
{"cat" : "Session","pid" :1774715,"tid" :1774715,"dur" :4308806,"ts" :73985,"ph" : "X","name" :"session_initialization","args" : {}}
]