Commit 108fc9e1 authored by Kai Chen

set up the codebase skeleton (WIP)

parent 6985ef31
import torch
from ._functions import Scatter
from torch.nn.parallel._functions import Scatter as OrigScatter
from detkit.datasets.utils import DataContainer
def scatter(inputs, target_gpus, dim=0):
"""Scatter inputs to target gpus.
The only difference from original :func:`scatter` is to add support for
:type:`~mmdet.DataContainer`.
"""
def scatter_map(obj):
if isinstance(obj, torch.Tensor):
return OrigScatter.apply(target_gpus, None, dim, obj)
if isinstance(obj, DataContainer) and isinstance(obj.data, list):
return Scatter.forward(target_gpus, obj.data)
if isinstance(obj, tuple) and len(obj) > 0:
return list(zip(*map(scatter_map, obj)))
if isinstance(obj, list) and len(obj) > 0:
return list(map(list, zip(*map(scatter_map, obj))))
if isinstance(obj, dict) and len(obj) > 0:
return list(map(type(obj), zip(*map(scatter_map, obj.items()))))
        return [obj for _ in target_gpus]
# After scatter_map is called, a scatter_map cell will exist. This cell
# has a reference to the actual function scatter_map, which has references
# to a closure that has a reference to the scatter_map cell (because the
# fn is recursive). To avoid this reference cycle, we set the function to
# None, clearing the cell
try:
return scatter_map(inputs)
finally:
scatter_map = None
def scatter_kwargs(inputs, kwargs, target_gpus, dim=0):
"""Scatter with support for kwargs dictionary"""
inputs = scatter(inputs, target_gpus, dim) if inputs else []
kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
if len(inputs) < len(kwargs):
inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
elif len(kwargs) < len(inputs):
kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
inputs = tuple(inputs)
kwargs = tuple(kwargs)
return inputs, kwargs
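# Illustrative usage sketch (not part of the original module): how
# scatter_kwargs might be called by a data-parallel wrapper. The shapes, GPU
# ids and the `return_loss` keyword below are made up for demonstration.
#
#   imgs = torch.randn(4, 3, 224, 224)
#   args, kwargs = scatter_kwargs((imgs,), dict(return_loss=True),
#                                 target_gpus=[0, 1])
#   # args[i] / kwargs[i] are fed to the model replica on target_gpus[i];
#   # tensors are split along dim 0, non-tensor values are replicated.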
from .nms import nms, soft_nms
from .roi_align import RoIAlign, roi_align
from .roi_pool import RoIPool, roi_pool
PYTHON ?= python
all:
echo "Compiling nms kernels..."
$(PYTHON) setup.py build_ext --inplace
clean:
rm *.so
from .nms_wrapper import nms, soft_nms
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
import numpy as np
cimport numpy as np
cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
return a if a >= b else b
cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
return a if a <= b else b
def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, float thresh):
cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]
cdef int ndets = dets.shape[0]
    cdef np.ndarray[np.int_t, ndim=1] suppressed = \
        np.zeros(ndets, dtype=np.int_)
# nominal indices
cdef int _i, _j
# sorted indices
cdef int i, j
# temp variables for box i's (the box currently under consideration)
cdef np.float32_t ix1, iy1, ix2, iy2, iarea
# variables for computing overlap with box j (lower scoring box)
cdef np.float32_t xx1, yy1, xx2, yy2
cdef np.float32_t w, h
cdef np.float32_t inter, ovr
keep = []
for _i in range(ndets):
i = order[_i]
if suppressed[i] == 1:
continue
keep.append(i)
ix1 = x1[i]
iy1 = y1[i]
ix2 = x2[i]
iy2 = y2[i]
iarea = areas[i]
for _j in range(_i + 1, ndets):
j = order[_j]
if suppressed[j] == 1:
continue
xx1 = max(ix1, x1[j])
yy1 = max(iy1, y1[j])
xx2 = min(ix2, x2[j])
yy2 = min(iy2, y2[j])
w = max(0.0, xx2 - xx1 + 1)
h = max(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (iarea + areas[j] - inter)
if ovr >= thresh:
suppressed[j] = 1
return keep
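# Worked example (added for clarity, not part of the original file): with the
# +1 convention above, boxes a = [0, 0, 10, 10] and b = [5, 5, 15, 15] each
# have area 11 * 11 = 121; their intersection is 6 * 6 = 36, so
# ovr = 36 / (121 + 121 - 36) ~= 0.175. With thresh = 0.5 box b is kept,
# while any thresh <= 0.175 would suppress it.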
# ----------------------------------------------------------
# Soft-NMS: Improving Object Detection With One Line of Code
# Copyright (c) University of Maryland, College Park
# Licensed under The MIT License [see LICENSE for details]
# Written by Navaneeth Bodla and Bharat Singh
# ----------------------------------------------------------
import numpy as np
cimport numpy as np
cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
return a if a >= b else b
cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
return a if a <= b else b
def cpu_soft_nms(
np.ndarray[float, ndim=2] boxes_in,
float sigma=0.5,
float Nt=0.3,
float threshold=0.001,
unsigned int method=0
):
boxes = boxes_in.copy()
cdef unsigned int N = boxes.shape[0]
cdef float iw, ih, box_area
cdef float ua
cdef int pos = 0
cdef float maxscore = 0
cdef int maxpos = 0
cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov
inds = np.arange(N)
for i in range(N):
maxscore = boxes[i, 4]
maxpos = i
tx1 = boxes[i,0]
ty1 = boxes[i,1]
tx2 = boxes[i,2]
ty2 = boxes[i,3]
ts = boxes[i,4]
ti = inds[i]
pos = i + 1
# get max box
while pos < N:
if maxscore < boxes[pos, 4]:
maxscore = boxes[pos, 4]
maxpos = pos
pos = pos + 1
# add max box as a detection
boxes[i,0] = boxes[maxpos,0]
boxes[i,1] = boxes[maxpos,1]
boxes[i,2] = boxes[maxpos,2]
boxes[i,3] = boxes[maxpos,3]
boxes[i,4] = boxes[maxpos,4]
inds[i] = inds[maxpos]
# swap ith box with position of max box
boxes[maxpos,0] = tx1
boxes[maxpos,1] = ty1
boxes[maxpos,2] = tx2
boxes[maxpos,3] = ty2
boxes[maxpos,4] = ts
inds[maxpos] = ti
tx1 = boxes[i,0]
ty1 = boxes[i,1]
tx2 = boxes[i,2]
ty2 = boxes[i,3]
ts = boxes[i,4]
pos = i + 1
# NMS iterations, note that N changes if detection boxes fall below
# threshold
while pos < N:
x1 = boxes[pos, 0]
y1 = boxes[pos, 1]
x2 = boxes[pos, 2]
y2 = boxes[pos, 3]
s = boxes[pos, 4]
area = (x2 - x1 + 1) * (y2 - y1 + 1)
iw = (min(tx2, x2) - max(tx1, x1) + 1)
if iw > 0:
ih = (min(ty2, y2) - max(ty1, y1) + 1)
if ih > 0:
ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih)
ov = iw * ih / ua #iou between max box and detection box
if method == 1: # linear
if ov > Nt:
weight = 1 - ov
else:
weight = 1
elif method == 2: # gaussian
weight = np.exp(-(ov * ov)/sigma)
else: # original NMS
if ov > Nt:
weight = 0
else:
weight = 1
boxes[pos, 4] = weight*boxes[pos, 4]
                    # if the box score falls below the threshold, discard the
                    # box by swapping it with the last box and decrementing N
if boxes[pos, 4] < threshold:
boxes[pos,0] = boxes[N-1, 0]
boxes[pos,1] = boxes[N-1, 1]
boxes[pos,2] = boxes[N-1, 2]
boxes[pos,3] = boxes[N-1, 3]
boxes[pos,4] = boxes[N-1, 4]
inds[pos] = inds[N-1]
N = N - 1
pos = pos - 1
pos = pos + 1
return boxes[:N], inds[:N]
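# Worked example of the three re-scoring modes (added for clarity, not part of
# the original file): for an overlap ov = 0.6 with Nt = 0.3 and sigma = 0.5,
#   linear   (method=1): weight = 1 - 0.6 = 0.4
#   gaussian (method=2): weight = exp(-0.6^2 / 0.5) = exp(-0.72) ~= 0.49
#   hard NMS (otherwise): weight = 0, since ov > Nt
# The box score is multiplied by this weight, and the box is dropped entirely
# once the re-weighted score falls below `threshold`.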
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id, size_t base);
size_t nms_Malloc();
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
import numpy as np
cimport numpy as np
assert sizeof(int) == sizeof(np.int32_t)
cdef extern from "gpu_nms.hpp":
void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int, size_t) nogil
size_t nms_Malloc() nogil
memory_pool = {}
def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, float thresh,
np.int32_t device_id=0):
cdef int boxes_num = dets.shape[0]
cdef int boxes_dim = dets.shape[1]
cdef int num_out
cdef size_t base
cdef np.ndarray[np.int32_t, ndim=1] \
keep = np.zeros(boxes_num, dtype=np.int32)
cdef np.ndarray[np.float32_t, ndim=1] \
scores = dets[:, 4]
cdef np.ndarray[np.int_t, ndim=1] \
order = scores.argsort()[::-1]
cdef np.ndarray[np.float32_t, ndim=2] \
sorted_dets = dets[order, :]
cdef float cthresh = thresh
if device_id not in memory_pool:
with nogil:
base = nms_Malloc()
memory_pool[device_id] = base
# print "malloc", base
base = memory_pool[device_id]
with nogil:
_nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, cthresh, device_id, base)
keep = keep[:num_out]
return list(order[keep])
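# Note on the returned indices (added for clarity, not part of the original
# file): `keep` indexes into the score-sorted boxes, so `order[keep]` maps the
# result back to the caller's original ordering. For example, with scores
# [0.9, 0.1, 0.8], order = [0, 2, 1]; if the kernel keeps sorted positions
# [0, 1], the function returns [0, 2].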
// ------------------------------------------------------------------
// Faster R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Shaoqing Ren
// ------------------------------------------------------------------
#include <stdio.h>
#include <iostream>
#include <vector>
#include "gpu_nms.hpp"
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define MULTIPLIER 16
#define LONGLONG_SIZE 64
int const threadsPerBlock =
    sizeof(unsigned long long) * 8 *
    MULTIPLIER;  // 64 bits per unsigned long long, times MULTIPLIER = 1024 threads
__device__ inline float devIoU(float const* const a, float const* const b) {
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f),
height = max(bottom - top + 1, 0.f);
float interS = width * height;
float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
const float* dev_boxes,
unsigned long long* dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
unsigned long long ts[MULTIPLIER];
if (threadIdx.x < row_size) {
#pragma unroll
for (int i = 0; i < MULTIPLIER; ++i) {
ts[i] = 0;
}
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const float* cur_box = dev_boxes + cur_box_idx * 5;
int i = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
ts[i / LONGLONG_SIZE] |= 1ULL << (i % LONGLONG_SIZE);
}
}
const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
#pragma unroll
for (int i = 0; i < MULTIPLIER; ++i) {
dev_mask[(cur_box_idx * col_blocks + col_start) * MULTIPLIER + i] =
ts[i];
}
}
}
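// Layout note (added for clarity, not part of the original file): each box i
// owns col_blocks * MULTIPLIER 64-bit words in dev_mask; the word at index
// (i * col_blocks + col_start) * MULTIPLIER + k covers boxes
// col_start * threadsPerBlock + 64 * k ... + 63, with a set bit meaning
// "box i suppresses that box". E.g. with n_boxes = 2000 and
// threadsPerBlock = 64 * 16 = 1024, col_blocks = DIVUP(2000, 1024) = 2 and
// the mask holds 2000 * 2 * 16 words.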
void _set_device(int device_id) {
int current_device;
CUDA_CHECK(cudaGetDevice(&current_device));
if (current_device == device_id) {
return;
}
// The call to cudaSetDevice must come before any calls to Get, which
// may perform initialization using the GPU.
CUDA_CHECK(cudaSetDevice(device_id));
}
const size_t MEMORY_SIZE = 500000000;
size_t nms_Malloc() {
float* boxes_dev = NULL;
CUDA_CHECK(cudaMalloc(&boxes_dev, MEMORY_SIZE));
return size_t(boxes_dev);
}
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id, size_t base) {
_set_device(device_id);
float* boxes_dev = NULL;
unsigned long long* mask_dev = NULL;
const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
if (base > 0) {
size_t require_mem =
boxes_num * boxes_dim * sizeof(float) +
boxes_num * col_blocks * sizeof(unsigned long long) * MULTIPLIER;
if (require_mem >= MEMORY_SIZE) {
std::cout << "require_mem: " << require_mem << std::endl;
}
boxes_dev = (float*)(base);
mask_dev =
(unsigned long long*)(base +
512 * ((unsigned long long)(boxes_num *
boxes_dim *
sizeof(float) /
512) +
1));
} else {
CUDA_CHECK(
cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(float)));
CUDA_CHECK(cudaMalloc(&mask_dev, MULTIPLIER * boxes_num * col_blocks *
sizeof(unsigned long long)));
}
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
boxes_num * boxes_dim * sizeof(float),
cudaMemcpyHostToDevice));
dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
DIVUP(boxes_num, threadsPerBlock));
dim3 threads(threadsPerBlock);
nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev,
mask_dev);
std::vector<unsigned long long> mask_host(boxes_num * col_blocks *
MULTIPLIER);
CUDA_CHECK(cudaMemcpy(
&mask_host[0], mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks * MULTIPLIER,
cudaMemcpyDeviceToHost));
std::vector<unsigned long long> remv(col_blocks * MULTIPLIER);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks * MULTIPLIER);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
int offset = inblock / LONGLONG_SIZE;
int bit_pos = inblock % LONGLONG_SIZE;
if (!(remv[nblock * MULTIPLIER + offset] & (1ULL << bit_pos))) {
keep_out[num_to_keep++] = i;
unsigned long long* p = &mask_host[0] + i * col_blocks * MULTIPLIER;
for (int j = nblock * MULTIPLIER + offset;
j < col_blocks * MULTIPLIER; j++) {
remv[j] |= p[j];
}
}
}
*num_out = num_to_keep;
if (!base) {
CUDA_CHECK(cudaFree(boxes_dev));
CUDA_CHECK(cudaFree(mask_dev));
}
}
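// Sizing sketch (added for clarity, not part of the original file): the
// pre-allocated pool of MEMORY_SIZE = 5e8 bytes must hold the boxes plus the
// suppression mask. For boxes_num = 20000 and boxes_dim = 5 this is roughly
// 20000 * 5 * 4 B = 0.4 MB for boxes and, with col_blocks = DIVUP(20000, 1024)
// = 20, another 20000 * 20 * 16 * 8 B ~= 51.2 MB for the mask, comfortably
// below the pool size; note that require_mem only warns (it does not abort)
// when the pool is too small.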
import numpy as np
import torch
from .gpu_nms import gpu_nms
from .cpu_nms import cpu_nms
from .cpu_soft_nms import cpu_soft_nms
def nms(dets, thresh, device_id=None):
    """Dispatch to either CPU or GPU NMS implementations."""
    is_tensor = isinstance(dets, torch.Tensor)
    if is_tensor:
        if dets.is_cuda:
            device_id = dets.get_device()
        dets_np = dets.detach().cpu().numpy()
    else:
        dets_np = dets
    assert isinstance(dets_np, np.ndarray)
    if dets_np.shape[0] == 0:
        inds = []
    else:
        inds = (gpu_nms(dets_np, thresh, device_id=device_id)
                if device_id is not None else cpu_nms(dets_np, thresh))
    # return the same container type that was passed in
    if is_tensor:
        return dets.new_tensor(inds, dtype=torch.long)
    else:
        return np.array(inds, dtype=np.int64)
def soft_nms(dets, Nt=0.3, method=1, sigma=0.5, min_score=0):
if isinstance(dets, torch.Tensor):
_dets = dets.detach().cpu().numpy()
else:
_dets = dets.copy()
    assert isinstance(_dets, np.ndarray)
    # cpu_soft_nms expects a float32 array of shape (n, 5)
    new_dets, inds = cpu_soft_nms(
        np.ascontiguousarray(_dets, dtype=np.float32),
        Nt=Nt, method=method, sigma=sigma, threshold=min_score)
    if isinstance(dets, torch.Tensor):
        return dets.new_tensor(
            inds, dtype=torch.long), dets.new_tensor(new_dets)
    else:
        return np.array(inds, dtype=np.int64), np.array(
            new_dets, dtype=np.float32)
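# Illustrative usage sketch (not part of the original module). The detection
# array layout is (x1, y1, x2, y2, score); the numbers below are made up.
#
#   dets = np.array([[10, 10, 50, 50, 0.9],
#                    [12, 12, 52, 52, 0.8],
#                    [100, 100, 150, 150, 0.7]], dtype=np.float32)
#   keep = nms(dets, 0.5)                    # CPU NMS -> indices of kept boxes
#   inds, new_dets = soft_nms(dets, Nt=0.3, method=2)  # Gaussian re-scoring
#
# Passing a CUDA tensor instead of an ndarray dispatches nms() to gpu_nms on
# the tensor's device and returns a LongTensor of kept indices.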
import os
from distutils.core import setup
from distutils.extension import Extension
import numpy as np
from Cython.Build import cythonize
from Cython.Distutils import build_ext
CUDA_ROOT = '/usr/local/cuda'
CUDA = {
"include": os.path.join(CUDA_ROOT, 'include'),
"lib": os.path.join(CUDA_ROOT, 'lib64'),
"nvcc": os.path.join(CUDA_ROOT, 'bin', "nvcc")
}
inc_dirs = [CUDA['include'], np.get_include()]
lib_dirs = [CUDA['lib']]
# extensions
ext_args = dict(
include_dirs=inc_dirs,
library_dirs=lib_dirs,
language='c++',
libraries=['cudart'],
extra_compile_args={
"cc": ['-Wno-unused-function', '-Wno-write-strings'],
"nvcc": [
'-arch=sm_52', '--ptxas-options=-v', '-c', '--compiler-options',
'-fPIC'
],
},
)
extensions = [
Extension('cpu_nms', ['cpu_nms.pyx'], **ext_args),
Extension('gpu_nms', ['gpu_nms.pyx', 'nms_kernel.cu'], **ext_args),
Extension('cpu_soft_nms', ['cpu_soft_nms.pyx'], **ext_args),
]
def customize_compiler_for_nvcc(self):
"""inject deep into distutils to customize how the dispatch
to cc/nvcc works.
If you subclass UnixCCompiler, it's not trivial to get your subclass
injected in, and still have the right customizations (i.e.
distutils.sysconfig.customize_compiler) run on it. So instead of going
the OO route, I have this. Note, it's kindof like a wierd functional
subclassing going on."""
# tell the compiler it can processes .cu
self.src_extensions.append('.cu')
# save references to the default compiler_so and _comple methods
default_compiler_so = self.compiler_so
super = self._compile
# now redefine the _compile method. This gets executed for each
# object but distutils doesn't have the ability to change compilers
# based on source extension: we add it.
def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
if os.path.splitext(src)[1] == '.cu':
# use the cuda for .cu files
self.set_executable('compiler_so', CUDA['nvcc'])
# use only a subset of the extra_postargs, which are 1-1 translated
# from the extra_compile_args in the Extension class
postargs = extra_postargs['nvcc']
else:
postargs = extra_postargs['cc']
        default_compile(obj, src, ext, cc_args, postargs, pp_opts)
# reset the default compiler_so, which we might have changed for cuda
self.compiler_so = default_compiler_so
# inject our redefined _compile method into the class
self._compile = _compile
# run the customize_compiler
class custom_build_ext(build_ext):
def build_extensions(self):
customize_compiler_for_nvcc(self.compiler)
build_ext.build_extensions(self)
setup(
name='nms',
cmdclass={'build_ext': custom_build_ext},
ext_modules=cythonize(extensions),
)
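# Build note (added for clarity): as in the accompanying Makefile, the
# extensions are compiled in place with
#
#   python setup.py build_ext --inplace
#
# CUDA_ROOT above is assumed to point at a local CUDA install; adjust it (and
# the -arch flag) to match the target machine.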
from .functions.roi_align import roi_align
from .modules.roi_align import RoIAlign
from torch.autograd import Function, Variable
from .. import roi_align_cuda
class RoIAlignFunction(Function):
@staticmethod
def forward(ctx, features, rois, out_size, spatial_scale, sample_num=0):
if isinstance(out_size, int):
out_h = out_size
out_w = out_size
elif isinstance(out_size, tuple):
assert len(out_size) == 2
assert isinstance(out_size[0], int)
assert isinstance(out_size[1], int)
out_h, out_w = out_size
else:
raise TypeError(
'"out_size" must be an integer or tuple of integers')
ctx.spatial_scale = spatial_scale
ctx.sample_num = sample_num
ctx.save_for_backward(rois)
ctx.feature_size = features.size()
batch_size, num_channels, data_height, data_width = features.size()
num_rois = rois.size(0)
output = features.new_zeros(num_rois, num_channels, out_h, out_w)
if features.is_cuda:
roi_align_cuda.forward(features, rois, out_h, out_w, spatial_scale,
sample_num, output)
else:
raise NotImplementedError
return output
@staticmethod
def backward(ctx, grad_output):
feature_size = ctx.feature_size
spatial_scale = ctx.spatial_scale
sample_num = ctx.sample_num
rois = ctx.saved_tensors[0]
assert (feature_size is not None and grad_output.is_cuda)
batch_size, num_channels, data_height, data_width = feature_size
out_w = grad_output.size(3)
out_h = grad_output.size(2)
grad_input = grad_rois = None
if ctx.needs_input_grad[0]:
grad_input = Variable(
rois.new(batch_size, num_channels, data_height, data_width)
.zero_())
roi_align_cuda.backward(grad_output, rois, out_h, out_w,
spatial_scale, sample_num, grad_input)
return grad_input, grad_rois, None, None, None
roi_align = RoIAlignFunction.apply
import numpy as np
import torch
from torch.autograd import gradcheck
import os.path as osp
import sys
sys.path.append(osp.abspath(osp.join(__file__, '../../')))
from roi_align import RoIAlign
feat_size = 15
spatial_scale = 1.0 / 8
img_size = feat_size / spatial_scale
num_imgs = 2
num_rois = 20
batch_ind = np.random.randint(num_imgs, size=(num_rois, 1))
rois = np.random.rand(num_rois, 4) * img_size * 0.5
rois[:, 2:] += img_size * 0.5
rois = np.hstack((batch_ind, rois))
feat = torch.randn(
num_imgs, 16, feat_size, feat_size, requires_grad=True, device='cuda:0')
rois = torch.from_numpy(rois).float().cuda()
inputs = (feat, rois)
print('Gradcheck for roi align...')
test = gradcheck(RoIAlign(3, spatial_scale), inputs, atol=1e-3, eps=1e-3)
print(test)
test = gradcheck(RoIAlign(3, spatial_scale, 2), inputs, atol=1e-3, eps=1e-3)
print(test)
from torch.nn.modules.module import Module
from ..functions.roi_align import RoIAlignFunction
class RoIAlign(Module):
def __init__(self, out_size, spatial_scale, sample_num=0):
super(RoIAlign, self).__init__()
self.out_size = out_size
self.spatial_scale = float(spatial_scale)
self.sample_num = int(sample_num)
def forward(self, features, rois):
return RoIAlignFunction.apply(features, rois, self.out_size,
self.spatial_scale, self.sample_num)
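# Illustrative usage sketch (not part of the original module): rois are given
# as (batch_index, x1, y1, x2, y2) rows in image coordinates and are mapped to
# the feature map via spatial_scale. The shapes below are made up.
#
#   align = RoIAlign(out_size=7, spatial_scale=1.0 / 16, sample_num=2)
#   feats = torch.randn(2, 256, 64, 64, device='cuda')
#   rois = torch.tensor([[0, 16., 16., 128., 128.],
#                        [1, 32., 48., 200., 240.]], device='cuda')
#   pooled = align(feats, rois)   # -> (2, 256, 7, 7)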
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
setup(
name='roi_align_cuda',
ext_modules=[
CUDAExtension('roi_align_cuda', [
'src/roi_align_cuda.cpp',
'src/roi_align_kernel.cu',
]),
],
cmdclass={'build_ext': BuildExtension})
#include <torch/torch.h>
#include <cmath>
#include <vector>
int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
const float spatial_scale, const int sample_num,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
at::Tensor output);
int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
const float spatial_scale, const int sample_num,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
at::Tensor bottom_grad);
#define CHECK_CUDA(x) AT_ASSERT(x.type().is_cuda(), #x " must be a CUDA tensor ")
#define CHECK_CONTIGUOUS(x) \
AT_ASSERT(x.is_contiguous(), #x " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
int roi_align_forward_cuda(at::Tensor features, at::Tensor rois,
int pooled_height, int pooled_width,
float spatial_scale, int sample_num,
at::Tensor output) {
CHECK_INPUT(features);
CHECK_INPUT(rois);
CHECK_INPUT(output);
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 5) {
printf("wrong roi size\n");
return 0;
}
int num_channels = features.size(1);
int data_height = features.size(2);
int data_width = features.size(3);
ROIAlignForwardLaucher(features, rois, spatial_scale, sample_num,
num_channels, data_height, data_width, num_rois,
pooled_height, pooled_width, output);
return 1;
}
int roi_align_backward_cuda(at::Tensor top_grad, at::Tensor rois,
int pooled_height, int pooled_width,
float spatial_scale, int sample_num,
at::Tensor bottom_grad) {
CHECK_INPUT(top_grad);
CHECK_INPUT(rois);
CHECK_INPUT(bottom_grad);
// Number of ROIs
int num_rois = rois.size(0);
int size_rois = rois.size(1);
if (size_rois != 5) {
printf("wrong roi size\n");
return 0;
}
int num_channels = bottom_grad.size(1);
int data_height = bottom_grad.size(2);
int data_width = bottom_grad.size(3);
ROIAlignBackwardLaucher(top_grad, rois, spatial_scale, sample_num,
num_channels, data_height, data_width, num_rois,
pooled_height, pooled_width, bottom_grad);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &roi_align_forward_cuda, "Roi_Align forward (CUDA)");
m.def("backward", &roi_align_backward_cuda, "Roi_Align backward (CUDA)");
}
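// Interface note (added for clarity, not part of the original file): both
// entry points expect contiguous CUDA tensors with
//   features / bottom_grad : (batch, channels, height, width)
//   rois                   : (num_rois, 5) as (batch_index, x1, y1, x2, y2)
//   output / top_grad      : (num_rois, channels, pooled_height, pooled_width)
// and return 1 on success, 0 if the roi tensor does not have 5 columns.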