Commit 7739077a authored by Josh Fromm's avatar Josh Fromm Committed by Facebook GitHub Bot
Browse files

Hipify various dependencies to enable AMD Face Enhancer

Summary:
Pull Request resolved: https://github.com/facebookresearch/d2go/pull/675

This diff extends several targets to be hip compatible and fixes a few silly hipification issues with those targets.

After these changes, all dependencies needed for the face enhancer can compile with AMD.

A few silly issues that I had to hack around, maybe we could improve hipification to avoid similar issues in the future:
* Some of the dependencies used sources in `src/cuda/**.cu`. Hipification tried to rename "cuda" to "hip" and broke the paths. I'm not sure where that rename happens so I just changed the directory from "cuda" to "gpu" to avoid the issue.
* One header import called `THCAtomics.cuh` was incorrectly being renamed to `THHAtomics.cuh`, which doesn't exist. Fortunately, an equivalent import that doesn't have naming issues was available.

We also might want to consider graduating the cpp_library_hip bazel helper out of fbgemm since it seems pretty generally useful.

For some of the targets, we needed to build a Python C++ extension, which as far as I can tell we didn't yet have good hipification support for. I added a new Buck rule, very similar to our standard cpp_library_hip rule, that creates an extension instead. It's a little copy-pasted, so if there are cleaner ways to work around this requirement, let me know.

Reviewed By: houseroad

Differential Revision: D61080247

fbshipit-source-id: dc6f101eb3eadfd43ef5610c651b1639e4c78ae6
parent e09224b8
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
*/ */
#include <vector> #include <vector>
#include "cuda/ms_deform_im2col_cuda.cuh" #include "ms_deform_im2col_cuda.cuh"
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
at::Tensor ms_deform_attn_cuda_forward( at::Tensor ms_deform_attn_cuda_forward(
const at::Tensor &value, const at::Tensor &value,
const at::Tensor &spatial_shapes, const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index, const at::Tensor &level_start_index,
const at::Tensor &sampling_loc, const at::Tensor &sampling_loc,
...@@ -50,7 +50,7 @@ at::Tensor ms_deform_attn_cuda_forward( ...@@ -50,7 +50,7 @@ at::Tensor ms_deform_attn_cuda_forward(
const int im2col_step_ = std::min(batch, im2col_step); const int im2col_step_ = std::min(batch, im2col_step);
AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
const int batch_n = im2col_step_; const int batch_n = im2col_step_;
...@@ -81,7 +81,7 @@ at::Tensor ms_deform_attn_cuda_forward( ...@@ -81,7 +81,7 @@ at::Tensor ms_deform_attn_cuda_forward(
std::vector<at::Tensor> ms_deform_attn_cuda_backward( std::vector<at::Tensor> ms_deform_attn_cuda_backward(
const at::Tensor &value, const at::Tensor &value,
const at::Tensor &spatial_shapes, const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index, const at::Tensor &level_start_index,
const at::Tensor &sampling_loc, const at::Tensor &sampling_loc,
...@@ -127,7 +127,7 @@ std::vector<at::Tensor> ms_deform_attn_cuda_backward( ...@@ -127,7 +127,7 @@ std::vector<at::Tensor> ms_deform_attn_cuda_backward(
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
for (int n = 0; n < batch/im2col_step_; ++n) for (int n = 0; n < batch/im2col_step_; ++n)
{ {
auto grad_output_g = grad_output_n.select(0, n); auto grad_output_g = grad_output_n.select(0, n);
...@@ -150,4 +150,4 @@ std::vector<at::Tensor> ms_deform_attn_cuda_backward( ...@@ -150,4 +150,4 @@ std::vector<at::Tensor> ms_deform_attn_cuda_backward(
return { return {
grad_value, grad_sampling_loc, grad_attn_weight grad_value, grad_sampling_loc, grad_attn_weight
}; };
} }
\ No newline at end of file
...@@ -15,8 +15,7 @@ ...@@ -15,8 +15,7 @@
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Atomic.cuh>
#include <THC/THCAtomics.cuh>
#define CUDA_KERNEL_LOOP(i, n) \ #define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
...@@ -31,7 +30,7 @@ inline int GET_BLOCKS(const int N, const int num_threads) ...@@ -31,7 +30,7 @@ inline int GET_BLOCKS(const int N, const int num_threads)
template <typename scalar_t> template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, __device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels, const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c) const scalar_t &h, const scalar_t &w, const int &m, const int &c)
{ {
...@@ -85,12 +84,12 @@ __device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, ...@@ -85,12 +84,12 @@ __device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
template <typename scalar_t> template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels, const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &h, const scalar_t &w, const int &m, const int &c,
const scalar_t &top_grad, const scalar_t &top_grad,
const scalar_t &attn_weight, const scalar_t &attn_weight,
scalar_t* &grad_value, scalar_t* &grad_value,
scalar_t* grad_sampling_loc, scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight) scalar_t* grad_attn_weight)
{ {
...@@ -140,7 +139,7 @@ __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, ...@@ -140,7 +139,7 @@ __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
v3 = bottom_data[ptr3]; v3 = bottom_data[ptr3];
grad_h_weight += hw * v3; grad_h_weight += hw * v3;
grad_w_weight -= lh * v3; grad_w_weight -= lh * v3;
atomicAdd(grad_value+ptr3, w3*top_grad_value); atomicAdd(grad_value+ptr3, w3*top_grad_value);
} }
scalar_t v4 = 0; scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) if (h_high <= height - 1 && w_high <= width - 1)
...@@ -160,12 +159,12 @@ __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, ...@@ -160,12 +159,12 @@ __device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
template <typename scalar_t> template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels, const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c, const scalar_t &h, const scalar_t &w, const int &m, const int &c,
const scalar_t &top_grad, const scalar_t &top_grad,
const scalar_t &attn_weight, const scalar_t &attn_weight,
scalar_t* &grad_value, scalar_t* &grad_value,
scalar_t* grad_sampling_loc, scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight) scalar_t* grad_attn_weight)
{ {
...@@ -215,7 +214,7 @@ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, ...@@ -215,7 +214,7 @@ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
v3 = bottom_data[ptr3]; v3 = bottom_data[ptr3];
grad_h_weight += hw * v3; grad_h_weight += hw * v3;
grad_w_weight -= lh * v3; grad_w_weight -= lh * v3;
atomicAdd(grad_value+ptr3, w3*top_grad_value); atomicAdd(grad_value+ptr3, w3*top_grad_value);
} }
scalar_t v4 = 0; scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1) if (h_high <= height - 1 && w_high <= width - 1)
...@@ -228,7 +227,7 @@ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, ...@@ -228,7 +227,7 @@ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
} }
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
atomicAdd(grad_attn_weight, top_grad * val); atomicAdd(grad_attn_weight, top_grad * val);
atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
} }
...@@ -236,15 +235,15 @@ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, ...@@ -236,15 +235,15 @@ __device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
template <typename scalar_t> template <typename scalar_t>
__global__ void ms_deformable_im2col_gpu_kernel(const int n, __global__ void ms_deformable_im2col_gpu_kernel(const int n,
const scalar_t *data_value, const scalar_t *data_value,
const int64_t *data_spatial_shapes, const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index, const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc, const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight, const scalar_t *data_attn_weight,
const int batch_size, const int batch_size,
const int spatial_size, const int spatial_size,
const int num_heads, const int num_heads,
const int channels, const int channels,
const int num_levels, const int num_levels,
const int num_query, const int num_query,
const int num_point, const int num_point,
...@@ -255,7 +254,7 @@ __global__ void ms_deformable_im2col_gpu_kernel(const int n, ...@@ -255,7 +254,7 @@ __global__ void ms_deformable_im2col_gpu_kernel(const int n,
int _temp = index; int _temp = index;
const int c_col = _temp % channels; const int c_col = _temp % channels;
_temp /= channels; _temp /= channels;
const int sampling_index = _temp; const int sampling_index = _temp;
const int m_col = _temp % num_heads; const int m_col = _temp % num_heads;
_temp /= num_heads; _temp /= num_heads;
const int q_col = _temp % num_query; const int q_col = _temp % num_query;
...@@ -268,7 +267,7 @@ __global__ void ms_deformable_im2col_gpu_kernel(const int n, ...@@ -268,7 +267,7 @@ __global__ void ms_deformable_im2col_gpu_kernel(const int n,
const int qid_stride = num_heads * channels; const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
scalar_t col = 0; scalar_t col = 0;
for (int l_col=0; l_col < num_levels; ++l_col) for (int l_col=0; l_col < num_levels; ++l_col)
{ {
const int level_start_id = data_level_start_index[l_col]; const int level_start_id = data_level_start_index[l_col];
...@@ -303,13 +302,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co ...@@ -303,13 +302,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co
const scalar_t *grad_col, const scalar_t *grad_col,
const scalar_t *data_value, const scalar_t *data_value,
const int64_t *data_spatial_shapes, const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index, const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc, const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight, const scalar_t *data_attn_weight,
const int batch_size, const int batch_size,
const int spatial_size, const int spatial_size,
const int num_heads, const int num_heads,
const int channels, const int channels,
const int num_levels, const int num_levels,
const int num_query, const int num_query,
const int num_point, const int num_point,
...@@ -325,7 +324,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co ...@@ -325,7 +324,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co
int _temp = index; int _temp = index;
const int c_col = _temp % channels; const int c_col = _temp % channels;
_temp /= channels; _temp /= channels;
const int sampling_index = _temp; const int sampling_index = _temp;
const int m_col = _temp % num_heads; const int m_col = _temp % num_heads;
_temp /= num_heads; _temp /= num_heads;
const int q_col = _temp % num_query; const int q_col = _temp % num_query;
...@@ -369,10 +368,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co ...@@ -369,10 +368,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co
{ {
ms_deform_attn_col2im_bilinear( ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr, top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
} }
__syncthreads(); __syncthreads();
if (tid == 0) if (tid == 0)
{ {
...@@ -385,8 +384,8 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co ...@@ -385,8 +384,8 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co
_grad_a += cache_grad_attn_weight[tid]; _grad_a += cache_grad_attn_weight[tid];
sid += 2; sid += 2;
} }
*grad_sampling_loc = _grad_w; *grad_sampling_loc = _grad_w;
*(grad_sampling_loc + 1) = _grad_h; *(grad_sampling_loc + 1) = _grad_h;
*grad_attn_weight = _grad_a; *grad_attn_weight = _grad_a;
...@@ -408,13 +407,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co ...@@ -408,13 +407,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co
const scalar_t *grad_col, const scalar_t *grad_col,
const scalar_t *data_value, const scalar_t *data_value,
const int64_t *data_spatial_shapes, const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index, const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc, const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight, const scalar_t *data_attn_weight,
const int batch_size, const int batch_size,
const int spatial_size, const int spatial_size,
const int num_heads, const int num_heads,
const int channels, const int channels,
const int num_levels, const int num_levels,
const int num_query, const int num_query,
const int num_point, const int num_point,
...@@ -430,7 +429,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co ...@@ -430,7 +429,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co
int _temp = index; int _temp = index;
const int c_col = _temp % channels; const int c_col = _temp % channels;
_temp /= channels; _temp /= channels;
const int sampling_index = _temp; const int sampling_index = _temp;
const int m_col = _temp % num_heads; const int m_col = _temp % num_heads;
_temp /= num_heads; _temp /= num_heads;
const int q_col = _temp % num_query; const int q_col = _temp % num_query;
...@@ -474,10 +473,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co ...@@ -474,10 +473,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co
{ {
ms_deform_attn_col2im_bilinear( ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr, top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
} }
__syncthreads(); __syncthreads();
for (unsigned int s=blockSize/2; s>0; s>>=1) for (unsigned int s=blockSize/2; s>0; s>>=1)
...@@ -493,7 +492,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co ...@@ -493,7 +492,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co
} }
if (tid == 0) if (tid == 0)
{ {
*grad_sampling_loc = cache_grad_sampling_loc[0]; *grad_sampling_loc = cache_grad_sampling_loc[0];
*(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
*grad_attn_weight = cache_grad_attn_weight[0]; *grad_attn_weight = cache_grad_attn_weight[0];
...@@ -515,13 +514,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, ...@@ -515,13 +514,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
const scalar_t *grad_col, const scalar_t *grad_col,
const scalar_t *data_value, const scalar_t *data_value,
const int64_t *data_spatial_shapes, const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index, const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc, const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight, const scalar_t *data_attn_weight,
const int batch_size, const int batch_size,
const int spatial_size, const int spatial_size,
const int num_heads, const int num_heads,
const int channels, const int channels,
const int num_levels, const int num_levels,
const int num_query, const int num_query,
const int num_point, const int num_point,
...@@ -538,7 +537,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, ...@@ -538,7 +537,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
int _temp = index; int _temp = index;
const int c_col = _temp % channels; const int c_col = _temp % channels;
_temp /= channels; _temp /= channels;
const int sampling_index = _temp; const int sampling_index = _temp;
const int m_col = _temp % num_heads; const int m_col = _temp % num_heads;
_temp /= num_heads; _temp /= num_heads;
const int q_col = _temp % num_query; const int q_col = _temp % num_query;
...@@ -582,10 +581,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, ...@@ -582,10 +581,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
{ {
ms_deform_attn_col2im_bilinear( ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr, top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
} }
__syncthreads(); __syncthreads();
if (tid == 0) if (tid == 0)
{ {
...@@ -598,8 +597,8 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, ...@@ -598,8 +597,8 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
_grad_a += cache_grad_attn_weight[tid]; _grad_a += cache_grad_attn_weight[tid];
sid += 2; sid += 2;
} }
*grad_sampling_loc = _grad_w; *grad_sampling_loc = _grad_w;
*(grad_sampling_loc + 1) = _grad_h; *(grad_sampling_loc + 1) = _grad_h;
*grad_attn_weight = _grad_a; *grad_attn_weight = _grad_a;
...@@ -620,13 +619,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, ...@@ -620,13 +619,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
const scalar_t *grad_col, const scalar_t *grad_col,
const scalar_t *data_value, const scalar_t *data_value,
const int64_t *data_spatial_shapes, const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index, const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc, const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight, const scalar_t *data_attn_weight,
const int batch_size, const int batch_size,
const int spatial_size, const int spatial_size,
const int num_heads, const int num_heads,
const int channels, const int channels,
const int num_levels, const int num_levels,
const int num_query, const int num_query,
const int num_point, const int num_point,
...@@ -643,7 +642,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, ...@@ -643,7 +642,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
int _temp = index; int _temp = index;
const int c_col = _temp % channels; const int c_col = _temp % channels;
_temp /= channels; _temp /= channels;
const int sampling_index = _temp; const int sampling_index = _temp;
const int m_col = _temp % num_heads; const int m_col = _temp % num_heads;
_temp /= num_heads; _temp /= num_heads;
const int q_col = _temp % num_query; const int q_col = _temp % num_query;
...@@ -687,10 +686,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, ...@@ -687,10 +686,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
{ {
ms_deform_attn_col2im_bilinear( ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr, top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
} }
__syncthreads(); __syncthreads();
for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
...@@ -706,7 +705,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, ...@@ -706,7 +705,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
} }
} }
__syncthreads(); __syncthreads();
} }
...@@ -733,13 +732,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const ...@@ -733,13 +732,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const
const scalar_t *grad_col, const scalar_t *grad_col,
const scalar_t *data_value, const scalar_t *data_value,
const int64_t *data_spatial_shapes, const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index, const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc, const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight, const scalar_t *data_attn_weight,
const int batch_size, const int batch_size,
const int spatial_size, const int spatial_size,
const int num_heads, const int num_heads,
const int channels, const int channels,
const int num_levels, const int num_levels,
const int num_query, const int num_query,
const int num_point, const int num_point,
...@@ -756,7 +755,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const ...@@ -756,7 +755,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const
int _temp = index; int _temp = index;
const int c_col = _temp % channels; const int c_col = _temp % channels;
_temp /= channels; _temp /= channels;
const int sampling_index = _temp; const int sampling_index = _temp;
const int m_col = _temp % num_heads; const int m_col = _temp % num_heads;
_temp /= num_heads; _temp /= num_heads;
const int q_col = _temp % num_query; const int q_col = _temp % num_query;
...@@ -800,10 +799,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const ...@@ -800,10 +799,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const
{ {
ms_deform_attn_col2im_bilinear( ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr, top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
} }
__syncthreads(); __syncthreads();
for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
...@@ -847,13 +846,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, ...@@ -847,13 +846,13 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
const scalar_t *grad_col, const scalar_t *grad_col,
const scalar_t *data_value, const scalar_t *data_value,
const int64_t *data_spatial_shapes, const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index, const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc, const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight, const scalar_t *data_attn_weight,
const int batch_size, const int batch_size,
const int spatial_size, const int spatial_size,
const int num_heads, const int num_heads,
const int channels, const int channels,
const int num_levels, const int num_levels,
const int num_query, const int num_query,
const int num_point, const int num_point,
...@@ -866,7 +865,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, ...@@ -866,7 +865,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
int _temp = index; int _temp = index;
const int c_col = _temp % channels; const int c_col = _temp % channels;
_temp /= channels; _temp /= channels;
const int sampling_index = _temp; const int sampling_index = _temp;
const int m_col = _temp % num_heads; const int m_col = _temp % num_heads;
_temp /= num_heads; _temp /= num_heads;
const int q_col = _temp % num_query; const int q_col = _temp % num_query;
...@@ -907,7 +906,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, ...@@ -907,7 +906,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
{ {
ms_deform_attn_col2im_bilinear_gm( ms_deform_attn_col2im_bilinear_gm(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr, top_grad, weight, grad_value_ptr,
grad_sampling_loc, grad_attn_weight); grad_sampling_loc, grad_attn_weight);
} }
data_weight_ptr += 1; data_weight_ptr += 1;
...@@ -923,15 +922,15 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, ...@@ -923,15 +922,15 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
template <typename scalar_t> template <typename scalar_t>
void ms_deformable_im2col_cuda(cudaStream_t stream, void ms_deformable_im2col_cuda(cudaStream_t stream,
const scalar_t* data_value, const scalar_t* data_value,
const int64_t* data_spatial_shapes, const int64_t* data_spatial_shapes,
const int64_t* data_level_start_index, const int64_t* data_level_start_index,
const scalar_t* data_sampling_loc, const scalar_t* data_sampling_loc,
const scalar_t* data_attn_weight, const scalar_t* data_attn_weight,
const int batch_size, const int batch_size,
const int spatial_size, const int spatial_size,
const int num_heads, const int num_heads,
const int channels, const int channels,
const int num_levels, const int num_levels,
const int num_query, const int num_query,
const int num_point, const int num_point,
scalar_t* data_col) scalar_t* data_col)
...@@ -942,9 +941,9 @@ void ms_deformable_im2col_cuda(cudaStream_t stream, ...@@ -942,9 +941,9 @@ void ms_deformable_im2col_cuda(cudaStream_t stream,
ms_deformable_im2col_gpu_kernel<scalar_t> ms_deformable_im2col_gpu_kernel<scalar_t>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
0, stream>>>( 0, stream>>>(
num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
cudaError_t err = cudaGetLastError(); cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) if (err != cudaSuccess)
{ {
...@@ -961,13 +960,13 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -961,13 +960,13 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
const int64_t * data_level_start_index, const int64_t * data_level_start_index,
const scalar_t * data_sampling_loc, const scalar_t * data_sampling_loc,
const scalar_t * data_attn_weight, const scalar_t * data_attn_weight,
const int batch_size, const int batch_size,
const int spatial_size, const int spatial_size,
const int num_heads, const int num_heads,
const int channels, const int channels,
const int num_levels, const int num_levels,
const int num_query, const int num_query,
const int num_point, const int num_point,
scalar_t* grad_value, scalar_t* grad_value,
scalar_t* grad_sampling_loc, scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight) scalar_t* grad_attn_weight)
...@@ -982,17 +981,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -982,17 +981,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t> ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
num_threads*3*sizeof(scalar_t), stream>>>( num_threads*3*sizeof(scalar_t), stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1005,17 +1004,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1005,17 +1004,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_gm<scalar_t> ms_deformable_col2im_gpu_kernel_gm<scalar_t>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
0, stream>>>( 0, stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1031,17 +1030,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1031,17 +1030,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1> ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
0, stream>>>( 0, stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1053,17 +1052,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1053,17 +1052,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2> ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
0, stream>>>( 0, stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1075,17 +1074,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1075,17 +1074,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4> ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
0, stream>>>( 0, stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1097,17 +1096,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1097,17 +1096,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8> ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
0, stream>>>( 0, stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1119,17 +1118,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1119,17 +1118,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16> ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
0, stream>>>( 0, stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1141,17 +1140,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1141,17 +1140,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32> ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
0, stream>>>( 0, stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1163,17 +1162,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1163,17 +1162,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64> ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
0, stream>>>( 0, stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1185,17 +1184,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1185,17 +1184,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128> ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
0, stream>>>( 0, stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1207,17 +1206,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1207,17 +1206,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256> ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
0, stream>>>( 0, stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1229,17 +1228,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1229,17 +1228,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512> ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
0, stream>>>( 0, stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1251,17 +1250,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1251,17 +1250,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024> ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
0, stream>>>( 0, stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1275,17 +1274,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1275,17 +1274,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t> ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
num_threads*3*sizeof(scalar_t), stream>>>( num_threads*3*sizeof(scalar_t), stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1298,17 +1297,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1298,17 +1297,17 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t> ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
<<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
num_threads*3*sizeof(scalar_t), stream>>>( num_threads*3*sizeof(scalar_t), stream>>>(
num_kernels, num_kernels,
grad_col, grad_col,
data_value, data_value,
data_spatial_shapes, data_spatial_shapes,
data_level_start_index, data_level_start_index,
data_sampling_loc, data_sampling_loc,
data_attn_weight, data_attn_weight,
batch_size, batch_size,
spatial_size, spatial_size,
num_heads, num_heads,
channels, channels,
num_levels, num_levels,
num_query, num_query,
num_point, num_point,
...@@ -1324,4 +1323,4 @@ void ms_deformable_col2im_cuda(cudaStream_t stream, ...@@ -1324,4 +1323,4 @@ void ms_deformable_col2im_cuda(cudaStream_t stream,
printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
} }
} }
\ No newline at end of file
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
#include "cpu/ms_deform_attn_cpu.h" #include "cpu/ms_deform_attn_cpu.h"
#ifdef WITH_CUDA #ifdef WITH_CUDA
#include "cuda/ms_deform_attn_cuda.h" #include "gpu/ms_deform_attn_cuda.h"
#endif #endif
...@@ -59,4 +59,3 @@ ms_deform_attn_backward( ...@@ -59,4 +59,3 @@ ms_deform_attn_backward(
} }
AT_ERROR("Not implemented on the CPU"); AT_ERROR("Not implemented on the CPU");
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment