init

5b17e272 · wangkx1 · 5b17e272 · 5b17e272 · 5b17e272 · 5b17e272
Commit 5b17e272 authored May 27, 2026 by wangkx1
7 changed files
--- a/DCNv4_op/src/cuda/flash_deform_attn_cuda.h
+++ b/DCNv4_op/src/cuda/flash_deform_attn_cuda.h
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#pragma once
+#include <torch/extension.h>
+// Forward entry point of the flash deformable attention CUDA op.
+// Only the declaration lives in this header; the definition is expected in a
+// separate translation unit.
+// NOTE(review): tensor layouts/dtypes and the valid ranges of im2col_step, K,
+// d_stride and block_thread are not visible from this declaration -- confirm
+// against the definition before relying on them.
+at::Tensor flash_deform_attn_cuda_forward(
+    const at::Tensor &value, const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index, const at::Tensor &sampling_loc_attn,
+    const int im2col_step, const int K, const int d_stride, const int block_thread);
+// Backward entry point of the flash deformable attention CUDA op.
+// Takes the same inputs as the forward declaration plus grad_output and
+// returns a vector of tensors.
+// NOTE(review): presumably the returned tensors are the gradients w.r.t.
+// value and sampling_loc_attn, in that order -- confirm against the
+// definition, which is not part of this header.
+std::vector<at::Tensor>
+flash_deform_attn_cuda_backward(
+    const at::Tensor &value, const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index, const at::Tensor &sampling_loc_attn,
+    const at::Tensor &grad_output, const int im2col_step, const int K,
+    const int d_stride, const int block_thread);
\ No newline at end of file
--- a/DCNv4_op/src/cuda/flash_deform_col2im_cuda.cuh
+++ b/DCNv4_op/src/cuda/flash_deform_col2im_cuda.cuh
+#include <algorithm>
+#include <cstdio>
+#include <cstring>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <THC/THCAtomics.cuh>
+#include <ATen/ATen.h>
+#include <ATen/OpMathType.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cooperative_groups.h>
+// #include <cooperative_groups/memcpy_async.h>
+#include <cooperative_groups/memcpy_async.h>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include "common.h"
+// Backward kernel (shared-memory reduction variant) of flash deformable
+// attention.
+//
+// Thread mapping, as used by the indexing below:
+//   threadIdx.x -> channel chunk [threadIdx.x * d_stride, +d_stride)
+//   threadIdx.y -> group index gi (blockDim.y is treated as G)
+//   threadIdx.z -> query handled inside this block
+//   blockIdx.x  -> block_multiplier consecutive (batch, query) rows
+//
+// p_offset stores, per (batch, query, group), L*K (w, h) sampling coordinate
+// pairs followed by L*K attention logits; grad_offset is written with the
+// same layout.
+//
+// Dynamic shared memory layout in opmath_t elements (m = block_multiplier):
+//   [0, m*G*L*K)                  gradient w.r.t. the softmax output
+//   next m*G*(D/d_stride)*3       per-thread partial (w, h, attn) gradients
+//   next m*G*L*K                  softmax probabilities
+//
+// NOTE(review): grad_im is only forwarded to ms_deform_attn_col2im_bilinear
+// (defined outside this file view); presumably it accumulates atomically --
+// confirm.
+template <typename scalar_t, int d_stride, typename transfer_t, int L, int K>
+__global__ void
+backward_kernel(const scalar_t *p_value, const int64_t *data_spatial_shapes,
+                const int64_t *data_level_start_index, const scalar_t *p_offset,
+                const scalar_t *grad_output, const int N, const int G,
+                const int D, const int Q,
+                const int block_multiplier, opmath_t *grad_im,
+                opmath_t *grad_offset) {
+  extern __shared__ char _s[];
+  const int qi = (blockIdx.x * block_multiplier % Q) + threadIdx.z;
+  const int bi = blockIdx.x * block_multiplier / Q;
+  const int di_s = threadIdx.x * d_stride;
+  const int gi = threadIdx.y;
+  opmath_t *cache_g_mask_before_softmax =
+      (opmath_t *)(_s); // (block_multiplier*G) * (L * K)
+  opmath_t *cache_grad_offset =
+      (opmath_t *)(cache_g_mask_before_softmax +
+                   block_multiplier * G * L *
+                       K); // (block_multiplier*G*D/d_stride*3)
+  opmath_t *const p_mask_shm =
+      ((opmath_t *)(cache_grad_offset +
+                    block_multiplier * G * D / d_stride * 3)) +
+      (threadIdx.z * G + gi) * L * K; // G*block_multiplier * L * K
+  const scalar_t *p_offset_ptr =
+      p_offset + (((bi * Q + qi) * G + gi) * L) * K * 3;
+  // First element of this (batch, query, group) row inside grad_offset.
+  const int grad_off_base = ((bi * Q + qi) * G + gi) * L * K * 3;
+  // Slot of this (query, group) row inside cache_g_mask_before_softmax.
+  const int g_mask_base = (threadIdx.y + threadIdx.z * G) * L * K;
+  const int mask_length = L * K;
+  const int num_thread = (D / d_stride);
+  const int num_iter = mask_length / num_thread;
+  const int remainder = mask_length - num_iter * num_thread;
+  const scalar_t *top_grad = grad_output + ((bi * Q + qi) * G + gi) * D + di_s;
+  // Cooperative copy of the L*K attention logits into shared memory.
+  for (int i = 0; i < num_iter; i++) {
+    *(p_mask_shm + num_thread * i + threadIdx.x) =
+        *(scalar_t *)(p_offset_ptr + L * K * 2 + num_thread * i + threadIdx.x);
+  }
+  if (remainder > 0 && threadIdx.x < remainder) {
+    *(p_mask_shm + num_thread * num_iter + threadIdx.x) =
+        *(scalar_t *)(p_offset_ptr + L * K * 2 + num_thread * num_iter +
+                      threadIdx.x);
+  }
+  __syncthreads();
+  // Calculate softmax over L and K (one thread per (query, group) row)
+  if (threadIdx.x == 0) {
+    opmath_t softmax_max = -1e100;
+    opmath_t softmax_sum = 0.0;
+    // get max
+    for (int j = 0; j < L * K; j++) {
+      softmax_max = max(softmax_max, p_mask_shm[j]);
+    }
+    // get sumexp
+    for (int j = 0; j < L * K; j++) {
+      opmath_t exp_results = exp(p_mask_shm[j] - softmax_max);
+      p_mask_shm[j] = exp_results;
+      softmax_sum += exp_results;
+    }
+    // normalize
+    for (int j = 0; j < L * K; j++) {
+      p_mask_shm[j] /= softmax_sum;
+    }
+  }
+  __syncthreads();
+  int offset_idx = 0;
+  int mask_idx = 0;
+  const int w_stride = G * D;
+  const int base_ptr = gi * D + di_s;
+  for (int li = 0; li < L; li++) {
+    const int spatial_h = data_spatial_shapes[li * 2];
+    const int spatial_w = data_spatial_shapes[li * 2 + 1];
+    const int level_start_id = data_level_start_index[li];
+    const scalar_t *p_value_ptr = p_value + (bi * N + level_start_id) * G * D;
+    opmath_t *grad_im_ptr = grad_im + (bi * N + level_start_id) * G * D;
+    int cache_grad_off_idx =
+        ((threadIdx.z * G + threadIdx.y) * blockDim.x + threadIdx.x) * 3;
+    for (int ki = 0; ki < K; ki++) {
+      const opmath_t loc_w = p_offset_ptr[offset_idx];
+      const opmath_t loc_h = p_offset_ptr[offset_idx + 1];
+      const opmath_t attn = p_mask_shm[mask_idx];
+      const opmath_t h_im = loc_h * spatial_h - 0.5;
+      const opmath_t w_im = loc_w * spatial_w - 0.5;
+      // for cache_grad_offset (mG) x D/d x 3
+      cache_grad_offset[cache_grad_off_idx] = 0;
+      cache_grad_offset[cache_grad_off_idx + 1] = 0;
+      cache_grad_offset[cache_grad_off_idx + 2] = 0;
+      if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+        ms_deform_attn_col2im_bilinear<scalar_t, transfer_t, d_stride>(
+            p_value_ptr, spatial_h, spatial_w, h_im, w_im, attn, w_stride,
+            base_ptr, spatial_h, spatial_w, top_grad, grad_im_ptr,
+            cache_grad_offset + cache_grad_off_idx);
+      }
+      // The bounds test differs between (query, group) rows of one block, so
+      // the barrier must live outside of it: every thread has to reach the
+      // same __syncthreads().
+      __syncthreads();
+      // aggregate across different channel for offset. This also runs for
+      // out-of-range samples, whose partials are still zero, so that the
+      // softmax backward below never reads an uninitialised shared entry.
+      if (threadIdx.x == 0) {
+        int _didx = (threadIdx.z * G + threadIdx.y) * blockDim.x * 3;
+        opmath_t _grad_w = cache_grad_offset[_didx];
+        opmath_t _grad_h = cache_grad_offset[_didx + 1];
+        opmath_t _grad_a = cache_grad_offset[_didx + 2];
+        for (int c_id = 1; c_id < blockDim.x; ++c_id) {
+          _grad_w += cache_grad_offset[_didx + 3 * c_id];
+          _grad_h += cache_grad_offset[_didx + 3 * c_id + 1];
+          _grad_a += cache_grad_offset[_didx + 3 * c_id + 2];
+        }
+        grad_offset[grad_off_base + li * K * 2 + ki * 2] = _grad_w;
+        grad_offset[grad_off_base + li * K * 2 + ki * 2 + 1] = _grad_h;
+        cache_g_mask_before_softmax[g_mask_base + li * K + ki] = _grad_a;
+      }
+      // Partials are re-zeroed at the top of the next iteration; wait until
+      // the reducing thread has consumed them.
+      __syncthreads();
+      offset_idx += 2;
+      mask_idx += 1;
+    }
+  }
+  // backward for softmax: grad_logit_i = sum_j g_j * p_j * (delta_ij - p_i)
+  if (threadIdx.x == 0) {
+    const opmath_t *group_g_mask = cache_g_mask_before_softmax + g_mask_base;
+    for (int i = 0; i < L * K; ++i) {
+      opmath_t grad_i = 0.;
+      for (int j = 0; j < L * K; ++j) {
+        if (i != j) {
+          grad_i -= group_g_mask[j] * p_mask_shm[i] * p_mask_shm[j];
+        } else {
+          grad_i += group_g_mask[i] * p_mask_shm[i] * (1 - p_mask_shm[i]);
+        }
+      }
+      grad_offset[grad_off_base + L * K * 2 + i] = grad_i;
+    }
+  }
+  __syncthreads();
+}
+// Backward kernel (warp-shuffle reduction variant) of flash deformable
+// attention.
+//
+// Same thread mapping as backward_kernel: threadIdx.x selects a channel chunk
+// of d_stride channels, threadIdx.y the group, threadIdx.z the query inside
+// the block. The per-channel partial gradients are kept in registers and
+// reduced with __shfl_down_sync among the blockDim.x threads that share one
+// (query, group) row.
+//
+// Dynamic shared memory layout in opmath_t elements (m = block_multiplier):
+//   [0, m*G*L*K)      gradient w.r.t. the softmax output
+//   next m*G*L*K      softmax probabilities
+//
+// NOTE(review): the shuffle reduction assumes blockDim.x is a power of two
+// and that each (query, group) row sits entirely inside one warp; presumably
+// check_backward_warpp guarantees this -- confirm. The 32-bit lane mask also
+// assumes kWarpSize <= 32.
+template <typename scalar_t, int d_stride, typename transfer_t, int L, int K>
+__global__ void
+backward_kernel_warp_primitive(const scalar_t *p_value, const int64_t *data_spatial_shapes,
+                const int64_t *data_level_start_index, const scalar_t *p_offset,
+                const scalar_t *grad_output, const int N, const int G,
+                const int D, const int Q,
+                const int block_multiplier, opmath_t *grad_im,
+                opmath_t *grad_offset) {
+  extern __shared__ char _s[];
+  const int qi = (blockIdx.x * block_multiplier % Q) + threadIdx.z;
+  const int bi = blockIdx.x * block_multiplier / Q;
+  const int di_s = threadIdx.x * d_stride;
+  const int gi = threadIdx.y;
+  const int group_per_warp = kWarpSize / blockDim.x;
+  const int group_in_warp_id = (threadIdx.z * G + threadIdx.y) % group_per_warp;
+  // Lanes of this (query, group) row. Built with unsigned arithmetic and a
+  // full-width special case because shifting a 32-bit value by 32 is
+  // undefined.
+  const unsigned row_lanes =
+      blockDim.x >= 32 ? 0xffffffffu : ((1u << blockDim.x) - 1u);
+  const unsigned lane_mask = row_lanes << (group_in_warp_id * blockDim.x);
+  opmath_t *cache_g_mask_before_softmax =
+      (opmath_t *)(_s); // (block_multiplier*G) * (L * K)
+  opmath_t *const p_mask_shm =
+      ((opmath_t *)(cache_g_mask_before_softmax + block_multiplier * G * L * K)) +
+        (threadIdx.z * G + gi) * L * K; // G*block_multiplier * L * K
+  const scalar_t *p_offset_ptr =
+      p_offset + (((bi * Q + qi) * G + gi) * L) * K * 3;
+  // First element of this (batch, query, group) row inside grad_offset.
+  const int grad_off_base = ((bi * Q + qi) * G + gi) * L * K * 3;
+  // Slot of this (query, group) row inside cache_g_mask_before_softmax.
+  const int g_mask_base = (threadIdx.y + threadIdx.z * G) * L * K;
+  const int mask_length = L * K;
+  const int num_thread = (D / d_stride);
+  const int num_iter = mask_length / num_thread;
+  const int remainder = mask_length - num_iter * num_thread;
+  const scalar_t *top_grad = grad_output + ((bi * Q + qi) * G + gi) * D + di_s;
+  // Cooperative copy of the L*K attention logits into shared memory.
+  for (int i = 0; i < num_iter; i++) {
+    *(p_mask_shm + num_thread * i + threadIdx.x) =
+        *(scalar_t *)(p_offset_ptr + L * K * 2 + num_thread * i + threadIdx.x);
+  }
+  if (remainder > 0 && threadIdx.x < remainder) {
+    *(p_mask_shm + num_thread * num_iter + threadIdx.x) =
+        *(scalar_t *)(p_offset_ptr + L * K * 2 + num_thread * num_iter +
+                      threadIdx.x);
+  }
+  __syncthreads();
+  // Calculate softmax over L and K (one thread per (query, group) row)
+  if (threadIdx.x == 0) {
+    opmath_t softmax_max = -1e100;
+    opmath_t softmax_sum = 0.0;
+    // get max
+    for (int j = 0; j < L * K; j++) {
+      softmax_max = max(softmax_max, p_mask_shm[j]);
+    }
+    // get sumexp
+    for (int j = 0; j < L * K; j++) {
+      opmath_t exp_results = exp(p_mask_shm[j] - softmax_max);
+      p_mask_shm[j] = exp_results;
+      softmax_sum += exp_results;
+    }
+    // normalize
+    for (int j = 0; j < L * K; j++) {
+      p_mask_shm[j] /= softmax_sum;
+    }
+  }
+  __syncthreads();
+  int offset_idx = 0;
+  int mask_idx = 0;
+  const int w_stride = G * D;
+  const int base_ptr = gi * D + di_s;
+  for (int li = 0; li < L; li++) {
+    const int spatial_h = data_spatial_shapes[li * 2];
+    const int spatial_w = data_spatial_shapes[li * 2 + 1];
+    const int level_start_id = data_level_start_index[li];
+    const scalar_t *p_value_ptr = p_value + (bi * N + level_start_id) * G * D;
+    opmath_t *grad_im_ptr = grad_im + (bi * N + level_start_id) * G * D;
+    opmath_t reg_grad_offset[3] = {0.};
+    for (int ki = 0; ki < K; ki++) {
+      const opmath_t loc_w = p_offset_ptr[offset_idx];
+      const opmath_t loc_h = p_offset_ptr[offset_idx + 1];
+      const opmath_t attn = p_mask_shm[mask_idx];
+      const opmath_t h_im = loc_h * spatial_h - 0.5;
+      const opmath_t w_im = loc_w * spatial_w - 0.5;
+      reg_grad_offset[0] = 0;
+      reg_grad_offset[1] = 0;
+      reg_grad_offset[2] = 0;
+      // The bounds test only depends on (query, group, li, ki), so all lanes
+      // named in lane_mask take the same side of this branch.
+      if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+        ms_deform_attn_col2im_bilinear<scalar_t, transfer_t, d_stride>(
+            p_value_ptr, spatial_h, spatial_w, h_im, w_im, attn, w_stride,
+            base_ptr, spatial_h, spatial_w, top_grad, grad_im_ptr,
+            reg_grad_offset);
+        // aggregate across different channel for offset
+        for (uint32_t offset = blockDim.x>>1; offset > 0; offset >>= 1){
+          reg_grad_offset[0] += __shfl_down_sync(lane_mask, reg_grad_offset[0], offset);
+          reg_grad_offset[1] += __shfl_down_sync(lane_mask, reg_grad_offset[1], offset);
+          reg_grad_offset[2] += __shfl_down_sync(lane_mask, reg_grad_offset[2], offset);
+        }
+        if (threadIdx.x == 0) {
+          grad_offset[grad_off_base + li * K * 2 + ki * 2] = reg_grad_offset[0];
+          grad_offset[grad_off_base + li * K * 2 + ki * 2 + 1] =
+              reg_grad_offset[1];
+          cache_g_mask_before_softmax[g_mask_base + li * K + ki] =
+              reg_grad_offset[2];
+        }
+      } else if (threadIdx.x == 0) {
+        // Out-of-range sample: it contributes nothing, so its gradients are
+        // zero. The shared entry must be written explicitly because dynamic
+        // shared memory is not zero-initialised and the softmax backward
+        // below reads every one of the L*K entries.
+        grad_offset[grad_off_base + li * K * 2 + ki * 2] = 0;
+        grad_offset[grad_off_base + li * K * 2 + ki * 2 + 1] = 0;
+        cache_g_mask_before_softmax[g_mask_base + li * K + ki] = 0;
+      }
+      __syncthreads();
+      offset_idx += 2;
+      mask_idx += 1;
+    }
+  }
+  // backward for softmax: grad_logit_i = sum_j g_j * p_j * (delta_ij - p_i)
+  if (threadIdx.x == 0) {
+    const opmath_t *group_g_mask = cache_g_mask_before_softmax + g_mask_base;
+    for (int i = 0; i < L * K; ++i) {
+      opmath_t grad_i = 0.;
+      for (int j = 0; j < L * K; ++j) {
+        if (i != j) {
+          grad_i -= group_g_mask[j] * p_mask_shm[i] * p_mask_shm[j];
+        } else {
+          grad_i += group_g_mask[i] * p_mask_shm[i] * (1 - p_mask_shm[i]);
+        }
+      }
+      grad_offset[grad_off_base + L * K * 2 + i] = grad_i;
+    }
+  }
+  __syncthreads();
+}
+// Host launcher for the backward (col2im) kernels.
+//
+// Picks backward_kernel_warp_primitive or backward_kernel depending on
+// check_backward_warpp(d_stride, D), instantiates it for the runtime number
+// of levels L (1..5), sizes the dynamic shared memory to match and launches
+// on `stream` with
+//   grid  = (B*Q / block_multiplier)
+//   block = (D / d_stride, G, block_multiplier)
+// where block_multiplier = block_thread / (D / d_stride) / G.
+//
+// Throws std::invalid_argument when L is outside 1..5 or when the launch
+// geometry degenerates (no channel chunk, no group, or block_thread too
+// small to hold a single query). A failing launch is reported through
+// AT_ASSERTM.
+template <typename scalar_t, typename stride_type, int K, int d_stride>
+void _flash_deformable_col2im_cuda(
+    cudaStream_t stream,
+    const scalar_t *value,                 // B, N, G, D
+    const int64_t *data_spatial_shapes,    // L * 2
+    const int64_t *data_level_start_index, // L
+    const scalar_t *offset,                // B, N, G, L, K, 3
+    const scalar_t *grad_output,           // B, N, G, D
+    const int B, const int N, const int G, const int D, const int L,
+    const int Q, opmath_t *grad_im, opmath_t *grad_offset,
+    const int block_thread) {
+  assert(D % d_stride == 0);
+  // Guard the divisions below; previously a too small block_thread produced
+  // block_multiplier == 0 and then a division by zero.
+  if (D / d_stride <= 0 || G <= 0) {
+    throw std::invalid_argument("invalid D, d_stride or G");
+  }
+  const int block_multiplier = block_thread / (D / d_stride) / G;
+  if (block_multiplier <= 0) {
+    throw std::invalid_argument("block_thread too small for D, d_stride and G");
+  }
+  assert((B*Q) % block_multiplier == 0);
+  dim3 num_blocks(B*Q / block_multiplier);
+  dim3 num_threads(D / d_stride, G, block_multiplier);
+  // Evaluated once: selects both the kernel flavour and its shared memory.
+  const bool use_warp_primitive = check_backward_warpp(d_stride, D);
+  // Softmax-gradient cache + softmax probabilities ...
+  int shm_size =
+      sizeof(opmath_t) * (block_multiplier * G * L * K) +
+      sizeof(opmath_t) * (G * block_multiplier * L * K);
+  if (!use_warp_primitive) {
+    // ... plus the per-thread (w, h, attn) partials of the shared-memory
+    // reduction variant.
+    shm_size += sizeof(opmath_t) * (G * block_multiplier * D / d_stride * 3);
+  }
+  auto kernel = backward_kernel_warp_primitive<scalar_t, d_stride, stride_type, 1, K>;
+  switch (L) {
+  case 1:
+    if (use_warp_primitive) {
+      kernel = backward_kernel_warp_primitive<scalar_t, d_stride, stride_type, 1, K>;
+    } else {
+      kernel = backward_kernel<scalar_t, d_stride, stride_type, 1, K>;
+    }
+    break;
+  case 2:
+    if (use_warp_primitive) {
+      kernel = backward_kernel_warp_primitive<scalar_t, d_stride, stride_type, 2, K>;
+    } else {
+      kernel = backward_kernel<scalar_t, d_stride, stride_type, 2, K>;
+    }
+    break;
+  case 3:
+    if (use_warp_primitive) {
+      kernel = backward_kernel_warp_primitive<scalar_t, d_stride, stride_type, 3, K>;
+    } else {
+      kernel = backward_kernel<scalar_t, d_stride, stride_type, 3, K>;
+    }
+    break;
+  case 4:
+    if (use_warp_primitive) {
+      kernel = backward_kernel_warp_primitive<scalar_t, d_stride, stride_type, 4, K>;
+    } else {
+      kernel = backward_kernel<scalar_t, d_stride, stride_type, 4, K>;
+    }
+    break;
+  case 5:
+    if (use_warp_primitive) {
+      kernel = backward_kernel_warp_primitive<scalar_t, d_stride, stride_type, 5, K>;
+    } else {
+      kernel = backward_kernel<scalar_t, d_stride, stride_type, 5, K>;
+    }
+    break;
+  default:
+    // L is an int: %d, not %ld.
+    printf("L=%d\n", L);
+    throw std::invalid_argument("invalid number of scales");
+  }
+  // Opt in to the requested amount of dynamic shared memory.
+  cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
+                       shm_size);
+  kernel<<<num_blocks, num_threads, shm_size, stream>>>(
+      value, data_spatial_shapes, data_level_start_index, offset, grad_output,
+      N, G, D, Q, block_multiplier, grad_im, grad_offset);
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("error in flash_deformable_col2im_cuda: %s\n",
+           cudaGetErrorString(err));
+    printf("launch arguments: gridDim=(%d, %d, %d), blockDim=(%d, %d, %d), "
+           "shm_size=%d, Q=%d\n\n",
+           num_blocks.x, num_blocks.y, num_blocks.z, num_threads.x,
+           num_threads.y, num_threads.z, shm_size, Q);
+    AT_ASSERTM(false, "kernel launch error");
+  }
+}
+template <typename scalar_t, int K>
+void flash_deformable_col2im_cuda_inner(
+    cudaStream_t stream,
+    const scalar_t *value,                 // B, N, G, D
+    const int64_t *data_spatial_shapes,    // L * 2
+    const int64_t *data_level_start_index, // L
+    const scalar_t *offset,                // B, N, G, L, K, 3
+    const scalar_t *grad_output,           // B, N, G, D
+    const int B, const int N, const int G, const int D, const int L,
+    const int Q, opmath_t *grad_im, opmath_t *grad_offset, 
+    const int d_stride, const int block_thread) {
+  assert(D % d_stride == 0);
+  if(sizeof(scalar_t) == 2) {
+    switch(d_stride) {
+    case 1:
+      _flash_deformable_col2im_cuda<scalar_t, scalar_t, K, 1>(
+          stream,
+          value,                  // B, N, G, D
+          data_spatial_shapes,    // L * 2
+          data_level_start_index, // L
+          offset,                 // B, N, G, L, K, 3
+          grad_output,            // B, N, G, D
+          B, N, G, D, L, Q, grad_im, grad_offset,
+          block_thread);
+      break;
+    case 2:
+      _flash_deformable_col2im_cuda<scalar_t, uint, K, 2>(
+          stream,
+          value,                  // B, N, G, D
+          data_spatial_shapes,    // L * 2
+          data_level_start_index, // L
+          offset,                 // B, N, G, L, K, 3
+          grad_output,            // B, N, G, D
+          B, N, G, D, L, Q, grad_im, grad_offset,
+          block_thread);
+      break;
+    case 4:
+      _flash_deformable_col2im_cuda<scalar_t, uint2, K, 4>(
+          stream,
+          value,                  // B, N, G, D
+          data_spatial_shapes,    // L * 2
+          data_level_start_index, // L
+          offset,                 // B, N, G, L, K, 3
+          grad_output,            // B, N, G, D
+          B, N, G, D, L, Q, grad_im, grad_offset,
+          block_thread);
+      break;
+    case 8:
+      _flash_deformable_col2im_cuda<scalar_t, uint4, K, 8>(
+          stream,
+          value,                  // B, N, G, D
+          data_spatial_shapes,    // L * 2
+          data_level_start_index, // L
+          offset,                 // B, N, G, L, K, 3
+          grad_output,            // B, N, G, D
+          B, N, G, D, L, Q, grad_im, grad_offset,
+          block_thread);
+      break;
+    case 16:
+      _flash_deformable_col2im_cuda<scalar_t, ulonglong4, K, 16>(
+          stream,
+          value,                  // B, N, G, D
+          data_spatial_shapes,    // L * 2
+          data_level_start_index, // L
+          offset,                 // B, N, G, L, K, 3
+          grad_output,            // B, N, G, D
+          B, N, G, D, L, Q, grad_im, grad_offset,
+          block_thread);
+      break;
+    default:
+      printf("not supported for d_stride > 16 for fp16");
+      throw std::invalid_argument("invalid d_stride");
+    }
+  } else {
+    assert(sizeof(scalar_t) == 4);
+    switch(d_stride) {
+    case 1:  
+      _flash_deformable_col2im_cuda<scalar_t, scalar_t, K, 1>(
+        stream,
+        value,                  // B, N, G, D
+        data_spatial_shapes,    // L * 2
+        data_level_start_index, // L
+        offset,                 // B, N, G, L, K, 3
+        grad_output,            // B, N, G, D
+        B, N, G, D, L, Q, grad_im, grad_offset,
+        block_thread);
+      break;
+    case 2:  
+      _flash_deformable_col2im_cuda<scalar_t, uint2, K, 2>(
+        stream,
+        value,                  // B, N, G, D
+        data_spatial_shapes,    // L * 2
+        data_level_start_index, // L
+        offset,                 // B, N, G, L, K, 3
+        grad_output,            // B, N, G, D
+        B, N, G, D, L, Q, grad_im, grad_offset,
+        block_thread);
+      break;
+    case 4:  
+      _flash_deformable_col2im_cuda<scalar_t, uint4, K, 4>(
+        stream,
+        value,                  // B, N, G, D
+        data_spatial_shapes,    // L * 2
+        data_level_start_index, // L
+        offset,                 // B, N, G, L, K, 3
+        grad_output,            // B, N, G, D
+        B, N, G, D, L, Q, grad_im, grad_offset,
+        block_thread);
+      break;
+    case 8:  
+      _flash_deformable_col2im_cuda<scalar_t, ulonglong4, K, 8>(
+        stream,
+        value,                  // B, N, G, D
+        data_spatial_shapes,    // L * 2
+        data_level_start_index, // L
+        offset,                 // B, N, G, L, K, 3
+        grad_output,            // B, N, G, D
+        B, N, G, D, L, Q, grad_im, grad_offset,
+        block_thread);
+      break;
+    default:
+      printf("not supported for d_stride > 8 for fp32");
+      throw std::invalid_argument("invalid d_stride");
+    }
+  }
+}
+// Public backward entry: turns the runtime number of sampling points K into
+// a template argument. Only K == 4 and K == 8 are instantiated; any other
+// value prints a message and throws std::invalid_argument.
+template <typename scalar_t>
+void flash_deformable_col2im_cuda(
+    cudaStream_t stream,
+    const scalar_t *value,                 // B, N, G, D
+    const int64_t *data_spatial_shapes,    // L * 2
+    const int64_t *data_level_start_index, // L
+    const scalar_t *offset,                // B, N, G, L, K, 3
+    const scalar_t *grad_output,           // B, N, G, D
+    const int B, const int N, const int G, const int D, const int L,
+    const int Q, const int K, opmath_t *grad_im, opmath_t *grad_offset,
+    const int d_stride, const int block_thread) {
+  if (K == 4) {
+    flash_deformable_col2im_cuda_inner<scalar_t, 4>(
+        stream, value, data_spatial_shapes, data_level_start_index, offset,
+        grad_output, B, N, G, D, L, Q, grad_im, grad_offset, d_stride,
+        block_thread);
+    return;
+  }
+  if (K == 8) {
+    flash_deformable_col2im_cuda_inner<scalar_t, 8>(
+        stream, value, data_spatial_shapes, data_level_start_index, offset,
+        grad_output, B, N, G, D, L, Q, grad_im, grad_offset, d_stride,
+        block_thread);
+    return;
+  }
+  // No instantiation exists for this K.
+  printf("not supported for K not in [4, 8]");
+  throw std::invalid_argument("invalid K");
+}
\ No newline at end of file
--- a/DCNv4_op/src/cuda/flash_deform_im2col_cuda.cuh
+++ b/DCNv4_op/src/cuda/flash_deform_im2col_cuda.cuh
+/*!
+**************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************
+* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
+* Copyright (c) 2018 Microsoft
+**************************************************************************
+*/
+#include <algorithm>
+#include <cstdio>
+#include <cstring>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <THC/THCAtomics.cuh>
+#include <ATen/ATen.h>
+#include <ATen/OpMathType.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cooperative_groups.h>
+#include <cooperative_groups/memcpy_async.h>
+#include <cuda.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+#include "common.h"
+// Forward kernel (shared-memory softmax variant) of flash deformable
+// attention.
+//
+// Thread mapping, as used by the indexing below:
+//   threadIdx.x -> channel chunk [threadIdx.x * d_stride, +d_stride)
+//   threadIdx.y -> group index gi
+//   threadIdx.z -> query handled inside this block
+//   blockIdx.x  -> block_multiplier consecutive (batch, query) rows
+//
+// p_offset stores, per (batch, query, group), L*K (w, h) sampling coordinate
+// pairs followed by L*K attention logits. Dynamic shared memory holds L*K
+// opmath_t values for every (threadIdx.z, threadIdx.y) pair.
+// NOTE(review): ms_deform_attn_im2col_bilinear is defined outside this view
+// (presumably in common.h); it is assumed to accumulate the attention
+// weighted bilinear sample into p_out_shm -- confirm.
+template <typename scalar_t, int d_stride, typename transfer_t, int L, int K>
+__global__ void
+forward_kernel(const scalar_t *p_value, const int64_t *data_spatial_shapes,
+               const int64_t *data_level_start_index, const scalar_t *p_offset,
+               scalar_t *p_output, const int N, const int G, const int D,
+               const int Q, const int block_multiplier) {
+  extern __shared__ char _s[];
+  const int &qi = (blockIdx.x * block_multiplier % Q) + threadIdx.z;
+  const int &bi = blockIdx.x * block_multiplier / Q;
+  const int &di_s = threadIdx.x * d_stride;
+  const int &gi = threadIdx.y;
+  // Despite its name this is a per-thread local accumulator, not shared
+  // memory.
+  opmath_t p_out_shm[d_stride] = {0.};
+  // Shared-memory slice with the logits/probabilities of this (query, group).
+  opmath_t *const p_mask_shm =
+      (opmath_t *)(_s) + (threadIdx.z * G + gi) * L * K;
+  const scalar_t *p_offset_ptr =
+      p_offset + (((bi * Q + qi) * G + gi) * L) * K * 3;
+  const int mask_length = L * K;
+  const int num_thread = (D / d_stride);
+  const int num_iter = mask_length / num_thread;
+  const int remainder = mask_length - num_iter * num_thread;
+  // Cooperative copy of the L*K logits (stored after the L*K*2 coordinates)
+  // into shared memory, num_thread elements per pass.
+  for (int i = 0; i < num_iter; i++) {
+    *(p_mask_shm + num_thread * i + threadIdx.x) =
+        *(scalar_t *)(p_offset_ptr + L * K * 2 + num_thread * i + threadIdx.x);
+  }
+  if (remainder > 0 && threadIdx.x < remainder) {
+    *(p_mask_shm + num_thread * num_iter + threadIdx.x) =
+        *(scalar_t *)(p_offset_ptr + L * K * 2 + num_thread * num_iter +
+                      threadIdx.x);
+  }
+  __syncthreads();
+  // Calculate softmax over L and K
+  if (threadIdx.x == 0) { // di = 0
+    opmath_t softmax_max = -1e100;
+    opmath_t softmax_sum = 0.0;
+    // get max
+    for (int j = 0; j < L * K; j++) {
+      softmax_max = max(softmax_max, p_mask_shm[j]);
+    }
+    // get sumexp
+    for (int j = 0; j < L * K; j++) {
+      opmath_t exp_results = exp(p_mask_shm[j] - softmax_max);
+      p_mask_shm[j] = exp_results;
+      softmax_sum += exp_results;
+    }
+    // normalize
+    for (int j = 0; j < L * K; j++) {
+      p_mask_shm[j] /= softmax_sum;
+    }
+  }
+  // Make the probabilities visible to every thread of the block.
+  __syncthreads();
+  int offset_idx = 0;
+  int mask_idx = 0;
+  const int w_stride = G * D;
+  const int base_ptr = gi * D + di_s;
+  for (int li = 0; li < L; li++) {
+    const int spatial_h = data_spatial_shapes[li * 2];
+    const int spatial_w = data_spatial_shapes[li * 2 + 1];
+    const int level_start_id = data_level_start_index[li];
+    const scalar_t *p_value_ptr = p_value + (bi * N + level_start_id) * G * D;
+    for (int ki = 0; ki < K; ki++) {
+      const opmath_t loc_w = p_offset_ptr[offset_idx];
+      const opmath_t loc_h = p_offset_ptr[offset_idx + 1];
+      const opmath_t attn = p_mask_shm[mask_idx];
+      // Scale the stored location by the level size and shift by half a
+      // pixel (presumably normalized -> pixel coordinates; confirm).
+      const opmath_t h_im = loc_h * spatial_h - 0.5;
+      const opmath_t w_im = loc_w * spatial_w - 0.5;
+      // Samples outside (-1, size) on either axis are skipped entirely.
+      if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+        ms_deform_attn_im2col_bilinear<scalar_t, transfer_t, d_stride>(
+            p_out_shm, p_value_ptr, spatial_h, spatial_w, h_im, w_im, attn,
+            w_stride, base_ptr);
+      }
+      offset_idx += 2;
+      mask_idx += 1;
+    }
+  }
+  int out_idx = ((bi * Q + qi) * G + gi) * D + di_s;
+  // In-place conversion: fp16_regs aliases the accumulator storage, so each
+  // opmath_t value is rewritten as scalar_t at the front of the same buffer.
+  // NOTE(review): relies on sizeof(scalar_t) <= sizeof(opmath_t) and on the
+  // ascending loop order so that no unread accumulator is overwritten.
+  scalar_t *fp16_regs = (scalar_t *)(p_out_shm);
+#pragma unroll
+  for (int ds = 0; ds < d_stride; ds++) {
+    fp16_regs[ds] = p_out_shm[ds];
+  }
+  // Single transfer_t-wide store of the converted values.
+  *(transfer_t *)(p_output + out_idx) = *(transfer_t *)(p_out_shm);
+}
+// Forward kernel (per-thread softmax variant) of flash deformable attention.
+//
+// Uses the same thread mapping as forward_kernel (threadIdx.x -> channel
+// chunk, threadIdx.y -> group, threadIdx.z -> query inside the block) but
+// declares no shared memory and contains no barrier: every thread loads the
+// L*K attention logits of its (batch, query, group) row into a local array
+// and computes the softmax itself.
+// NOTE(review): ms_deform_attn_im2col_bilinear is defined outside this view
+// (presumably in common.h); it is assumed to accumulate the attention
+// weighted bilinear sample into p_out_shm -- confirm.
+template <typename scalar_t, int d_stride, typename transfer_t, int L, int K>
+__global__ void
+forward_kernel_reg(const scalar_t *p_value, const int64_t *data_spatial_shapes,
+               const int64_t *data_level_start_index, const scalar_t *p_offset,
+               scalar_t *p_output, const int N, const int G, const int D,
+               const int Q, const int block_multiplier) {
+  const int &qi = (blockIdx.x * block_multiplier % Q) + threadIdx.z;
+  const int &bi = blockIdx.x * block_multiplier / Q;
+  const int &di_s = threadIdx.x * d_stride;
+  const int &gi = threadIdx.y;
+  // Both arrays are per-thread locals despite the "_shm" suffix.
+  opmath_t p_out_shm[d_stride] = {0.};
+  opmath_t p_mask_shm[L*K] = {0.};
+  const scalar_t *p_offset_ptr =
+      p_offset + (((bi * Q + qi) * G + gi) * L) * K * 3;
+  // The logits are stored after the L*K*2 sampling coordinates.
+  for (int i=0; i < L*K; i++){
+    p_mask_shm[i] = *(p_offset_ptr + L * K * 2 + i);
+  }
+  // Calculate softmax over L and K
+  opmath_t softmax_max = -1e100;
+  opmath_t softmax_sum = 0.0;
+  // get max
+  for (int j = 0; j < L * K; j++) {
+    softmax_max = max(softmax_max, p_mask_shm[j]);
+  }
+  // get sumexp
+  for (int j = 0; j < L * K; j++) {
+    opmath_t exp_results = exp(p_mask_shm[j] - softmax_max);
+    p_mask_shm[j] = exp_results;
+    softmax_sum += exp_results;
+  }
+  // normalize
+  for (int j = 0; j < L * K; j++) {
+    p_mask_shm[j] /= softmax_sum;
+  }
+  int offset_idx = 0;
+  int mask_idx = 0;
+  const int w_stride = G * D;
+  const int base_ptr = gi * D + di_s;
+  for (int li = 0; li < L; li++) {
+    const int spatial_h = data_spatial_shapes[li * 2];
+    const int spatial_w = data_spatial_shapes[li * 2 + 1];
+    const int level_start_id = data_level_start_index[li];
+    const scalar_t *p_value_ptr = p_value + (bi * N + level_start_id) * G * D;
+    for (int ki = 0; ki < K; ki++) {
+      const opmath_t loc_w = p_offset_ptr[offset_idx];
+      const opmath_t loc_h = p_offset_ptr[offset_idx + 1];
+      const opmath_t attn = p_mask_shm[mask_idx];
+      // Scale the stored location by the level size and shift by half a
+      // pixel (presumably normalized -> pixel coordinates; confirm).
+      const opmath_t h_im = loc_h * spatial_h - 0.5;
+      const opmath_t w_im = loc_w * spatial_w - 0.5;
+      // Samples outside (-1, size) on either axis are skipped entirely.
+      if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
+        ms_deform_attn_im2col_bilinear<scalar_t, transfer_t, d_stride>(
+            p_out_shm, p_value_ptr, spatial_h, spatial_w, h_im, w_im, attn,
+            w_stride, base_ptr);
+      }
+      offset_idx += 2;
+      mask_idx += 1;
+    }
+  }
+  int out_idx = ((bi * Q + qi) * G + gi) * D + di_s;
+  // In-place conversion: fp16_regs aliases the accumulator storage, so each
+  // opmath_t value is rewritten as scalar_t at the front of the same buffer.
+  // NOTE(review): relies on sizeof(scalar_t) <= sizeof(opmath_t) and on the
+  // ascending loop order so that no unread accumulator is overwritten.
+  scalar_t *fp16_regs = (scalar_t *)(p_out_shm);
+#pragma unroll
+  for (int ds = 0; ds < d_stride; ds++) {
+    fp16_regs[ds] = p_out_shm[ds];
+  }
+  // Single transfer_t-wide store of the converted values.
+  *(transfer_t *)(p_output + out_idx) = *(transfer_t *)(p_out_shm);
+}
+// Host launcher for the forward (im2col) kernel.
+//
+// Instantiates forward_kernel_reg for the runtime number of levels L (1..5)
+// and launches it on `stream` with
+//   grid  = (B*Q / block_multiplier)
+//   block = (D / d_stride, G, block_multiplier)
+// where block_multiplier = block_thread / (D / d_stride) / G. No dynamic
+// shared memory is requested.
+//
+// NOTE(review): `_use_reg` is accepted but never read -- the register
+// variant is always launched. Kept as is to preserve behaviour; confirm
+// whether the shared-memory forward_kernel was meant to be selectable.
+//
+// Throws std::invalid_argument when L is outside 1..5 or when the launch
+// geometry degenerates (no channel chunk, no group, or block_thread too
+// small to hold a single query). A failing launch is reported through
+// AT_ASSERTM.
+template <typename scalar_t, typename stride_type, int K, int d_stride>
+void _flash_deformable_im2col_cuda(
+    cudaStream_t stream,
+    const scalar_t *value,                 // B, N, G, D
+    const int64_t *data_spatial_shapes,    // L * 2
+    const int64_t *data_level_start_index, // L
+    const scalar_t *offset,                // B, N, G, L, K, 3
+    scalar_t *output,                      // B, N, G, D
+    const int B, const int N, const int G, const int D, const int L,
+    const int Q, const int block_thread,
+    const bool _use_reg) {
+  assert(D % d_stride == 0);
+  // Guard the divisions below; previously a too small block_thread produced
+  // block_multiplier == 0 and then a division by zero.
+  if (D / d_stride <= 0 || G <= 0) {
+    throw std::invalid_argument("invalid D, d_stride or G");
+  }
+  const int block_multiplier = block_thread / (D / d_stride) / G;
+  if (block_multiplier <= 0) {
+    throw std::invalid_argument("block_thread too small for D, d_stride and G");
+  }
+  assert((B*Q) % block_multiplier == 0);
+  dim3 num_blocks(B*Q / block_multiplier);
+  dim3 num_threads(D / d_stride, G, block_multiplier);
+  const int shm_size = 0;
+  auto kernel = forward_kernel_reg<scalar_t, d_stride, stride_type, 1, K>;
+  switch (L) {
+  case 1:
+    kernel = forward_kernel_reg<scalar_t, d_stride, stride_type, 1, K>;
+    break;
+  case 2:
+    kernel = forward_kernel_reg<scalar_t, d_stride, stride_type, 2, K>;
+    break;
+  case 3:
+    kernel = forward_kernel_reg<scalar_t, d_stride, stride_type, 3, K>;
+    break;
+  case 4:
+    kernel = forward_kernel_reg<scalar_t, d_stride, stride_type, 4, K>;
+    break;
+  case 5:
+    kernel = forward_kernel_reg<scalar_t, d_stride, stride_type, 5, K>;
+    break;
+  default:
+    // L is an int: %d, not %ld.
+    printf("L=%d\n", L);
+    throw std::invalid_argument("invalid number of scales");
+  }
+  cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
+                       shm_size);
+  kernel<<<num_blocks, num_threads, shm_size, stream>>>(
+      value, data_spatial_shapes, data_level_start_index, offset, output, N, G,
+      D, Q, block_multiplier);
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("error in flash_deformable_im2col_cuda: %s\n",
+           cudaGetErrorString(err));
+    printf("launch arguments: gridDim=(%d, %d, %d), blockDim=(%d, %d, %d), "
+           "shm_size=%d, Q=%d\n\n",
+           num_blocks.x, num_blocks.y, num_blocks.z, num_threads.x,
+           num_threads.y, num_threads.z, shm_size, Q);
+    AT_ASSERTM(false, "kernel launch error");
+  }
+}
+// Forwards the argument list of flash_deformable_im2col_cuda_inner to the
+// launcher instantiated for the given vector transfer type and d_stride.
+// Only meaningful inside flash_deformable_im2col_cuda_inner; undefined again
+// right after it.
+#define FLASH_DEFORM_IM2COL_LAUNCH(STRIDE_T, DS)                              \
+  _flash_deformable_im2col_cuda<scalar_t, STRIDE_T, K, DS>(                   \
+      stream, value, data_spatial_shapes, data_level_start_index, offset,     \
+      output, B, N, G, D, L, Q, block_thread, _use_reg)
+/*!
+ * Turns the runtime d_stride into a template argument and picks the
+ * transfer type used for d_stride elements at once:
+ *   - 2-byte scalar_t: 1 -> scalar_t, 2 -> uint, 4 -> uint2, 8 -> uint4,
+ *     16 -> ulonglong4;
+ *   - otherwise (asserted to be 4-byte): 1 -> scalar_t, 2 -> uint2,
+ *     4 -> uint4, 8 -> ulonglong4.
+ *
+ * All remaining arguments are passed through unchanged to
+ * _flash_deformable_im2col_cuda.
+ *
+ * \throws std::invalid_argument for any other d_stride value.
+ */
+template <typename scalar_t, int K>
+void flash_deformable_im2col_cuda_inner(
+    cudaStream_t stream,
+    const scalar_t *value,                 // B, N, G, D
+    const int64_t *data_spatial_shapes,    // L * 2
+    const int64_t *data_level_start_index, // L
+    const scalar_t *offset,                // B, N, G, L, K, 3
+    scalar_t *output,                      // B, N, G, D
+    const int B, const int N, const int G, const int D, const int L,
+    const int Q, const int d_stride, 
+    const int block_thread,
+    const bool _use_reg) {
+  assert(D % d_stride == 0);
+  const bool two_byte_scalar = (sizeof(scalar_t) == 2);
+  if (!two_byte_scalar) {
+    // Anything that is not 2 bytes wide is expected to be 4 bytes wide.
+    assert(sizeof(scalar_t) == 4);
+    switch (d_stride) {
+    case 1:
+      FLASH_DEFORM_IM2COL_LAUNCH(scalar_t, 1);
+      return;
+    case 2:
+      FLASH_DEFORM_IM2COL_LAUNCH(uint2, 2);
+      return;
+    case 4:
+      FLASH_DEFORM_IM2COL_LAUNCH(uint4, 4);
+      return;
+    case 8:
+      FLASH_DEFORM_IM2COL_LAUNCH(ulonglong4, 8);
+      return;
+    default:
+      printf("not supported for d_stride > 8 for fp32");
+      throw std::invalid_argument("invalid d_stride");
+    }
+  }
+  // 2-byte scalar path.
+  switch (d_stride) {
+  case 1:
+    FLASH_DEFORM_IM2COL_LAUNCH(scalar_t, 1);
+    return;
+  case 2:
+    FLASH_DEFORM_IM2COL_LAUNCH(uint, 2);
+    return;
+  case 4:
+    FLASH_DEFORM_IM2COL_LAUNCH(uint2, 4);
+    return;
+  case 8:
+    FLASH_DEFORM_IM2COL_LAUNCH(uint4, 8);
+    return;
+  case 16:
+    FLASH_DEFORM_IM2COL_LAUNCH(ulonglong4, 16);
+    return;
+  default:
+    printf("not supported for d_stride > 16 for fp16");
+    throw std::invalid_argument("invalid d_stride");
+  }
+}
+#undef FLASH_DEFORM_IM2COL_LAUNCH
+/*!
+ * Entry point of the forward launcher chain: maps the runtime K onto the
+ * matching flash_deformable_im2col_cuda_inner instantiation (K = 4 or K = 8)
+ * and hands every other argument through unchanged.
+ *
+ * \throws std::invalid_argument when K is neither 4 nor 8.
+ */
+template <typename scalar_t>
+void flash_deformable_im2col_cuda(
+    cudaStream_t stream,
+    const scalar_t *value,                 // B, N, G, D
+    const int64_t *data_spatial_shapes,    // L * 2
+    const int64_t *data_level_start_index, // L
+    const scalar_t *offset,                // B, N, G, L, K, 3
+    scalar_t *output,                      // B, N, G, D
+    const int B, const int N, const int G, const int D, const int L,
+    const int Q, const int K, const int d_stride, 
+    const int block_thread,
+    const bool _use_reg) {
+  if (K == 4) {
+    flash_deformable_im2col_cuda_inner<scalar_t, 4>(
+        stream, value, data_spatial_shapes, data_level_start_index, offset,
+        output, B, N, G, D, L, Q, d_stride, block_thread, _use_reg);
+    return;
+  }
+  if (K == 8) {
+    flash_deformable_im2col_cuda_inner<scalar_t, 8>(
+        stream, value, data_spatial_shapes, data_level_start_index, offset,
+        output, B, N, G, D, L, Q, d_stride, block_thread, _use_reg);
+    return;
+  }
+  // Any other K has no compiled instantiation.
+  printf("not supported for K not in [4, 8]");
+  throw std::invalid_argument("invalid K");
+}
\ No newline at end of file
--- a/DCNv4_op/src/dcnv4.h
+++ b/DCNv4_op/src/dcnv4.h
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#pragma once
+#ifdef WITH_CUDA
+#include "cuda/dcnv4_cuda.h"
+#include "cuda/flash_deform_attn_cuda.h"
+#endif
+/*!
+ * Device dispatcher for the flash deformable attention forward pass.
+ *
+ * When `value` lives on a CUDA device and the extension was built with
+ * WITH_CUDA, all arguments are forwarded unchanged to
+ * flash_deform_attn_cuda_forward and its result is returned. In every other
+ * case an error is raised through AT_ERROR.
+ *
+ * Declared `inline` because the definition lives in a header: without it,
+ * including this header from more than one translation unit produces
+ * multiple-definition link errors.
+ */
+inline at::Tensor flash_deform_attn_forward(const at::Tensor &value,
+                                        const at::Tensor &spatial_shapes,
+                                        const at::Tensor &level_start_index,
+                                        const at::Tensor &sampling_loc_attn,
+                                        const int im2col_step, const int K,
+                                        const int d_stride, const int block_thread) {
+  if (value.device().is_cuda()) {
+#ifdef WITH_CUDA
+    return flash_deform_attn_cuda_forward(value, spatial_shapes,
+                                             level_start_index,
+                                             sampling_loc_attn, im2col_step, 
+                                             K, d_stride, block_thread);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  // Only reached for non-CUDA tensors.
+  AT_ERROR("Not implemented on the CPU");
+}
+/*!
+ * Device dispatcher for the flash deformable attention backward pass.
+ *
+ * When `value` lives on a CUDA device and the extension was built with
+ * WITH_CUDA, all arguments are forwarded unchanged to
+ * flash_deform_attn_cuda_backward and its result is returned. In every other
+ * case an error is raised through AT_ERROR.
+ *
+ * Declared `inline` because the definition lives in a header: without it,
+ * including this header from more than one translation unit produces
+ * multiple-definition link errors.
+ */
+inline std::vector<at::Tensor>
+flash_deform_attn_backward(const at::Tensor &value, 
+                              const at::Tensor &spatial_shapes,
+                              const at::Tensor &level_start_index, 
+                              const at::Tensor &sampling_loc_attn,
+                              const at::Tensor &grad_output, 
+                              const int im2col_step, 
+                              const int K, 
+                              const int d_stride, const int block_thread){
+  if (value.device().is_cuda()) {
+#ifdef WITH_CUDA
+    return flash_deform_attn_cuda_backward(value, 
+                                              spatial_shapes,
+                                              level_start_index,
+                                              sampling_loc_attn, 
+                                              grad_output,
+                                              im2col_step, 
+                                              K, d_stride, 
+                                              block_thread);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  // Only reached for non-CUDA tensors.
+  AT_ERROR("Not implemented on the CPU");
+}
+/*!
+ * Device dispatcher for the DCNv4 forward pass.
+ *
+ * When `value` lives on a CUDA device and the extension was built with
+ * WITH_CUDA, all arguments are forwarded unchanged to dcnv4_cuda_forward and
+ * its result is returned. In every other case an error is raised through
+ * AT_ERROR.
+ *
+ * Declared `inline` because the definition lives in a header: without it,
+ * including this header from more than one translation unit produces
+ * multiple-definition link errors.
+ */
+inline at::Tensor dcnv4_forward(
+    const at::Tensor &value,
+    const at::Tensor &p_offset,
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, const int pad_h, const int pad_w, const int dilation_h,
+    const int dilation_w, const int group, const int group_channels,
+    const float offset_scale, const int im2col_step, const int remove_center,
+    const int d_stride, const int block_thread, const bool softmax) {
+  if (value.device().is_cuda()) {
+#ifdef WITH_CUDA
+    return dcnv4_cuda_forward(
+        value, p_offset, kernel_h, kernel_w, stride_h, stride_w, pad_h,
+        pad_w, dilation_h, dilation_w, group, group_channels, offset_scale,
+        im2col_step, remove_center, d_stride, block_thread, softmax);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  // Only reached for non-CUDA tensors.
+  AT_ERROR("Not implemented on the CPU");
+}
+/*!
+ * Device dispatcher for the DCNv4 backward pass.
+ *
+ * When `value` lives on a CUDA device and the extension was built with
+ * WITH_CUDA, all arguments are forwarded to dcnv4_cuda_backward (note that
+ * grad_output sits between im2col_step and remove_center in both signatures)
+ * and its result is returned. In every other case an error is raised through
+ * AT_ERROR.
+ *
+ * Declared `inline` because the definition lives in a header: without it,
+ * including this header from more than one translation unit produces
+ * multiple-definition link errors.
+ */
+inline std::vector<at::Tensor>
+dcnv4_backward(
+    const at::Tensor &value, 
+    const at::Tensor &p_offset, 
+    const int kernel_h, const int kernel_w, const int stride_h,
+    const int stride_w, const int pad_h, const int pad_w, const int dilation_h,
+    const int dilation_w, const int group, const int group_channels,
+    const float offset_scale, const int im2col_step, const at::Tensor &grad_output, 
+    const int remove_center, const int d_stride, const int block_thread,
+    const bool softmax){
+  if (value.device().is_cuda()) {
+#ifdef WITH_CUDA
+    return dcnv4_cuda_backward(
+        value, p_offset, kernel_h, kernel_w, stride_h, stride_w, pad_h,
+        pad_w, dilation_h, dilation_w, group, group_channels, offset_scale,
+        im2col_step, grad_output, remove_center, d_stride, block_thread,
+        softmax);
+#else
+    AT_ERROR("Not compiled with GPU support");
+#endif
+  }
+  // Only reached for non-CUDA tensors.
+  AT_ERROR("Not implemented on the CPU");
+}
\ No newline at end of file
--- a/DCNv4_op/src/vision.cpp
+++ b/DCNv4_op/src/vision.cpp
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from
+*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+#include "dcnv4.h"
+// Python bindings: exposes the four dispatch functions from dcnv4.h under
+// their C++ names, each with its own name as the docstring.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  // module::def returns the module itself, so the registrations are chained.
+  m.def("flash_deform_attn_forward",
+        &flash_deform_attn_forward,
+        "flash_deform_attn_forward")
+      .def("flash_deform_attn_backward",
+           &flash_deform_attn_backward,
+           "flash_deform_attn_backward")
+      .def("dcnv4_forward",
+           &dcnv4_forward,
+           "dcnv4_forward")
+      .def("dcnv4_backward",
+           &dcnv4_backward,
+           "dcnv4_backward");
+}
--- a/LICENSE
+++ b/LICENSE
+MIT License
+Copyright (c) 2022 OpenGVLab
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README_origin.md
+++ b/README_origin.md
+# [DCNv4](https://arxiv.org/pdf/2401.06197.pdf)
+## News
+- `Jan 15, 2024`: 🚀 Compared with InternImage, the new FlashInternImage powered with DCNv4 has faster inference speed, faster convergence, and better performance!!!
+- `Jan 15, 2024`: 🚀 "DCNv4" is released!
+## Introduction
+We introduce Deformable Convolution v4 (DCNv4), a highly efficient and effective operator designed for a broad spectrum of vision applications. DCNv4 addresses the limitations of its predecessor, DCNv3, with two key enhancements: (1) removing softmax normalization in spatial aggregation to enhance its dynamic property and expressive power, and (2) optimizing memory access to minimize redundant operations for speedup. These improvements result in significantly faster convergence compared to DCNv3 and a substantial increase in processing speed, with DCNv4 achieving more than three times the forward speed.
+DCNv4 demonstrates exceptional performance across various tasks, including image classification, instance and semantic segmentation, and notably, image generation. 
+When integrated into generative models like U-Net in the latent diffusion model, DCNv4 outperforms its baseline, underscoring its possibility to enhance generative models.
+In practical applications, replacing DCNv3 with DCNv4 in the InternImage model to create FlashInternImage results in a speed increase of up to 80% and a further performance improvement, without any other modifications.
+The advancements in speed and efficiency of DCNv4, combined with its robust performance across diverse vision tasks, show its potential as a foundational building block for future vision models.
+## Released Models
+<details>
+<summary> ImageNet Image Classification </summary>
+<br>
+<div>
+|      name      |   pretrain   | resolution | acc@1 | #param | download                                                                              |
+| :------------: | :----------: | :--------: | :---: | :----: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| FlashInternImage-T  | ImageNet-1K  |  224x224   | 83.6  |  30M   |        [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/flash_intern_image_t_1k_224.pth) \| [cfg](classification/configs/flash_intern_image_t_1k_224.yaml)       |
+| FlashInternImage-S  | ImageNet-1K  |  224x224   | 84.4  |  50M   |        [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/flash_intern_image_s_1k_224.pth) \| [cfg](classification/configs/flash_intern_image_s_1k_224.yaml)       |
+| FlashInternImage-B  | ImageNet-1K  |  224x224   | 84.9  |  97M   |      [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/flash_intern_image_b_1k_224.pth) \| [cfg](classification/configs/flash_intern_image_b_1k_224.yaml)       |
+| FlashInternImage-L  | ImageNet-22K |  384x384   | 88.1  |  223M  | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/flash_internimage_l_22kto1k_384.pth) \| [cfg](classification/configs/flash_intern_image_l_22kto1k_384.yaml)  |
+</div>
+</details>
+<details>
+<summary> COCO Object Detection and Instance Segmentation </summary>
+<br>
+<div>
+|    backbone   |method |  schd | box mAP | mask mAP |Config | Download | 
+| :-----------------:| :----------:  |  :---------: | :-----:  |:------: | :-----: | :---: |
+| FlashInternImage-T  |Mask-RCNN|          1x      |  48.0   |   43.1    |  [config](./detection/configs/coco/mask_rcnn_flash_intern_image_t_fpn_1x_coco.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask_rcnn_flash_internimage_t_fpn_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask_rcnn_flash_internimage_t_fpn_1x_coco.log) |
+| FlashInternImage-T  |Mask-RCNN |          3x      |  49.5   |   44.0     | [config](./detection/configs/coco/mask_rcnn_flash_intern_image_t_fpn_3x_coco.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask_rcnn_flash_internimage_t_fpn_3x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask_rcnn_flash_internimage_t_fpn_3x_coco.log) |
+| FlashInternImage-S   |Mask-RCNN|          1x      |  49.2   |   44.0    |  [config](./detection/configs/coco/mask_rcnn_flash_intern_image_s_fpn_1x_coco.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask_rcnn_flash_internimage_s_fpn_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask_rcnn_flash_internimage_s_fpn_1x_coco.log) |
+| FlashInternImage-S  |Mask-RCNN |          3x      |  50.5   |   44.9   | [config](./detection/configs/coco/mask_rcnn_flash_intern_image_s_fpn_3x_coco.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask_rcnn_flash_internimage_s_fpn_3x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask_rcnn_flash_internimage_s_fpn_3x_coco.log) |
+| FlashInternImage-B  |Mask-RCNN |          1x      |  50.1   |   44.5  | [config](./detection/configs/coco/mask_rcnn_flash_intern_image_b_fpn_1x_coco.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask_rcnn_flash_internimage_b_fpn_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask_rcnn_flash_internimage_b_fpn_1x_coco.log) |
+| FlashInternImage-B   |Mask-RCNN|          3x      |  50.6   |   45.4  |  [config](./detection/configs/coco/mask_rcnn_flash_intern_image_b_fpn_3x_coco.py)| [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask_rcnn_flash_internimage_b_fpn_3x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask_rcnn_flash_internimage_b_fpn_3x_coco.log) |
+|    backbone    |   method|      schd | box mAP | mask mAP | Config | Download |
+| :------------:|  :---------: |  :---------: | :-----: | :------: | :---: | :---: |
+| FlashInternImage-L |Cascade Mask R-CNN |        1x      |  55.6   |   48.2     | [config](./detection/configs/coco/cascade_flash_intern_image_l_fpn_1x_coco.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/cascade_flash_internimage_l_fpn_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/cascade_flash_internimage_l_fpn_1x_coco.log) |
+| FlashInternImage-L |Cascade Mask R-CNN |        3x      |  56.7   |   48.9    | [config](./detection/configs/coco/cascade_flash_intern_image_l_fpn_3x_coco.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/cascade_flash_internimage_l_fpn_3x_coco.pth)  |
+|    backbone   |method |  lr type     | pretrain    |       schd | box mAP  | Config | Download |
+| :------------: |  :---------: |  :---------: |:---------: | :---------: | :-----: |  :---: | :-----: |
+| FlashInternImage-T  |DINO| layer-wise lr    | ImageNet-1K  |     1x      |  54.7   |     [config](./detection/configs/coco/dino_4scale_flash_internimage_t_1x_coco.py)     | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/dino_4scale_flash_internimage_t_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/dino_4scale_flash_internimage_t_1x_coco.json) |
+| FlashInternImage-S  |DINO | layer-wise lr    | ImageNet-1K  |     1x      |  55.3   |    [config](./detection/configs/coco/dino_4scale_flash_internimage_s_1x_coco.py)     | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/dino_4scale_flash_internimage_s_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/dino_4scale_flash_internimage_s_1x_coco.log) |
+| FlashInternImage-B   |DINO| layer-wise lr    | ImageNet-1K  |     1x      |  56.0     | [config](./detection/configs/coco/dino_4scale_flash_internimage_b_1x_coco.py)     | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/dino_4scale_flash_internimage_b_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/dino_4scale_flash_internimage_b_1x_coco.log) |
+| FlashInternImage-L  |DINO | 0.1x backbone lr | ImageNet-22K |     1x      |  58.8     |  [config](./detection/configs/coco/dino_4scale_flash_internimage_l_1x_coco.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/dino_4scale_flash_internimage_l_1x_coco.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/dino_4scale_flash_internimage_l_1x_coco.log) |
+</div>
+</details>
+<details>
+<summary> ADE20K Semantic Segmentation </summary>
+<br>
+<div>
+| backbone      |method | resolution | mIoU (ss/ms) | Config | Download            |
+|:--------------:|:----------:|:----------:|:-----------:|:-----------:|:----------:
+| FlashInternImage-T|UperNet  | 512x512    | 49.3 / 50.3   | [config](./segmentation/configs/ade20k/upernet_flash_internimage_t_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_t_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_t_512_160k_ade20k.log)   | 
+| FlashInternImage-S |UperNet   | 512x512    | 50.6 / 51.6     | [config](./segmentation/configs/ade20k/upernet_flash_internimage_s_512_160k_ade20k.py)  | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_s_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_s_512_160k_ade20k.log)  | 
+| FlashInternImage-B |UperNet   | 512x512    | 52.0 / 52.6       | [config](./segmentation/configs/ade20k/upernet_flash_internimage_b_512_160k_ade20k.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_b_512_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_s_512_160k_ade20k.log)  | 
+| FlashInternImage-L  |UperNet  | 640x640    | 55.6 / 56.0    | [config](./segmentation/configs/ade20k/upernet_flash_internimage_l_640_160k_ade20k.py)| [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_l_640_160k_ade20k.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/upernet_flash_internimage_l_640_160k_ade20k.log)  | 
+| backbone      |method | resolution | mIoU (ss) | Config | Download            |
+|:--------------:|:----------:|:----------:|:-----------:|:-----------:|:----------:
+| FlashInternImage-T  |Mask2Former| 512x512    | 51.2   | [config](./segmentation/configs/ade20k/mask2former_flash_internimage_t_512_160k_ade20k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_t_512_160k_ade20k_ss.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_t_512_160k_ade20k_ss.log)   | 
+| FlashInternImage-S   |Mask2Former| 640x640    | 52.6     | [config](./segmentation/configs/ade20k/mask2former_flash_internimage_s_640_160k_ade20k_ss.py)  | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_s_640_160k_ade20k_ss.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_s_640_160k_ade20k_ss.log)  | 
+| FlashInternImage-B   |Mask2Former| 640x640    |  53.4       | [config](./segmentation/configs/ade20k/mask2former_flash_internimage_b_640_160k_ade20k_ss.py) | [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_b_640_160k_ade20k_ss.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_b_640_160k_ade20k_ss.log)  | 
+| FlashInternImage-L   |Mask2Former| 640x640    | 56.7     | [config](./segmentation/configs/ade20k/mask2former_flash_internimage_l_640_160k_ade20k_ss.py)| [ckpt](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_l_640_160k_ade20k_ss.pth) \| [log](https://huggingface.co/OpenGVLab/DCNv4/resolve/main/mask2former_flash_internimage_l_640_160k_ade20k_ss.log)  | 
+</div>
+</details>
+## Citations
+If this work is helpful for your research, please consider citing the following BibTeX entry.
+```bibtex
+@article{xiong2024efficient,
+      title={Efficient Deformable ConvNets: Rethinking Dynamic and Sparse Operator for Vision Applications}, 
+      author={Yuwen Xiong and Zhiqi Li and Yuntao Chen and Feng Wang and Xizhou Zhu and Jiapeng Luo and Wenhai Wang and Tong Lu and Hongsheng Li and Yu Qiao and Lewei Lu and Jie Zhou and Jifeng Dai},
+      journal={arXiv preprint arXiv:2401.06197},
+      year={2024}
+}
+@article{wang2022internimage,
+  title={InternImage: Exploring Large-Scale Vision Foundation Models with Deformable Convolutions},
+  author={Wang, Wenhai and Dai, Jifeng and Chen, Zhe and Huang, Zhenhang and Li, Zhiqi and Zhu, Xizhou and Hu, Xiaowei and Lu, Tong and Lu, Lewei and Li, Hongsheng and others},
+  journal={arXiv preprint arXiv:2211.05778},
+  year={2022}
+}
+@inproceedings{zhu2022uni,
+  title={Uni-perceiver: Pre-training unified architecture for generic perception for zero-shot and few-shot tasks},
+  author={Zhu, Xizhou and Zhu, Jinguo and Li, Hao and Wu, Xiaoshi and Li, Hongsheng and Wang, Xiaohua and Dai, Jifeng},
+  booktitle={CVPR},
+  pages={16804--16815},
+  year={2022}
+}
+@article{zhu2022unimoe,
+  title={Uni-perceiver-moe: Learning sparse generalist models with conditional moes},
+  author={Zhu, Jinguo and Zhu, Xizhou and Wang, Wenhai and Wang, Xiaohua and Li, Hongsheng and Wang, Xiaogang and Dai, Jifeng},
+  journal={arXiv preprint arXiv:2206.04674},
+  year={2022}
+}
+@article{li2022uni,
+  title={Uni-Perceiver v2: A Generalist Model for Large-Scale Vision and Vision-Language Tasks},
+  author={Li, Hao and Zhu, Jinguo and Jiang, Xiaohu and Zhu, Xizhou and Li, Hongsheng and Yuan, Chun and Wang, Xiaohua and Qiao, Yu and Wang, Xiaogang and Wang, Wenhai and others},
+  journal={arXiv preprint arXiv:2211.09808},
+  year={2022}
+}
+@article{yang2022bevformer,
+  title={BEVFormer v2: Adapting Modern Image Backbones to Bird's-Eye-View Recognition via Perspective Supervision},
+  author={Yang, Chenyu and Chen, Yuntao and Tian, Hao and Tao, Chenxin and Zhu, Xizhou and Zhang, Zhaoxiang and Huang, Gao and Li, Hongyang and Qiao, Yu and Lu, Lewei and others},
+  journal={arXiv preprint arXiv:2211.10439},
+  year={2022}
+}
+@article{su2022towards,
+  title={Towards All-in-one Pre-training via Maximizing Multi-modal Mutual Information},
+  author={Su, Weijie and Zhu, Xizhou and Tao, Chenxin and Lu, Lewei and Li, Bin and Huang, Gao and Qiao, Yu and Wang, Xiaogang and Zhou, Jie and Dai, Jifeng},
+  journal={arXiv preprint arXiv:2211.09807},
+  year={2022}
+}
+@inproceedings{li2022bevformer,
+  title={Bevformer: Learning bird’s-eye-view representation from multi-camera images via spatiotemporal transformers},
+  author={Li, Zhiqi and Wang, Wenhai and Li, Hongyang and Xie, Enze and Sima, Chonghao and Lu, Tong and Qiao, Yu and Dai, Jifeng},
+  booktitle={ECCV},
+  pages={1--18},
+  year={2022},
+}
+```