Unverified commit 6ce278bb, authored by vfdev and committed by GitHub
Browse files

Fixes deform_conv issue with large input/output (#4351)



* WIP on fixing index overflow issue

* Fixed backward pass for large num_kernels

* Fixed clang formatting

* Fixed GET_BLOCKS int/int64_t types issue
Co-authored-by: vfdev-5 <vfdev-5@gmail.com>
Co-authored-by: Francisco Massa <fvsmassa@gmail.com>
parent d9e6d60f
@@ -3,10 +3,12 @@
 namespace vision {
 namespace ops {
-#define CUDA_1D_KERNEL_LOOP(i, n) \
-for (int i = (blockIdx.x * blockDim.x) + threadIdx.x; i < (n); \
-i += (blockDim.x * gridDim.x))
+#define CUDA_1D_KERNEL_LOOP_T(i, n, index_t) \
+for (index_t i = (blockIdx.x * blockDim.x) + threadIdx.x; i < (n); \
+i += (blockDim.x * gridDim.x))
+#define CUDA_1D_KERNEL_LOOP(i, n) CUDA_1D_KERNEL_LOOP_T(i, n, int)
 template <typename integer>
 constexpr __host__ __device__ inline integer ceil_div(integer n, integer m) {
 return (n + m - 1) / m;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment