"git@developer.sourcefind.cn:hehl2/torchaudio.git" did not exist on "9c274228ccac862e2f330bed1fd7c0b3d8b05c67"
Unverified commit e421e832, authored by Kai Chen and committed by GitHub

port nms extension from maskrcnn-benchmark (#469)

* port nms extension from maskrcnn-benchmark

* fix linting error
parent ee7e679a
@@ -104,7 +104,7 @@ venv.bak/
 .mypy_cache/
 # cython generated cpp
-mmdet/ops/nms/*.cpp
+mmdet/ops/nms/src/soft_nms_cpu.cpp
 mmdet/version.py
 data
 .vscode
......
@@ -18,8 +18,10 @@ $PYTHON setup.py build_ext --inplace
 echo "Building nms op..."
 cd ../nms
-make clean
-make PYTHON=${PYTHON}
+if [ -d "build" ]; then
+    rm -r build
+fi
+$PYTHON setup.py build_ext --inplace
 echo "Building dcn..."
 cd ../dcn
......
PYTHON=${PYTHON:-python}

all:
	echo "Compiling nms kernels..."
	$(PYTHON) setup.py build_ext --inplace

clean:
	rm -f *.so
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
# cython: language_level=3, boundscheck=False
import numpy as np
cimport numpy as np
cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
return a if a >= b else b
cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
return a if a <= b else b
def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]
cdef int ndets = dets.shape[0]
cdef np.ndarray[np.int_t, ndim=1] suppressed = \
np.zeros((ndets), dtype=np.int)
# nominal indices
cdef int _i, _j
# sorted indices
cdef int i, j
# temp variables for box i's (the box currently under consideration)
cdef np.float32_t ix1, iy1, ix2, iy2, iarea
# variables for computing overlap with box j (lower scoring box)
cdef np.float32_t xx1, yy1, xx2, yy2
cdef np.float32_t w, h
cdef np.float32_t inter, ovr
keep = []
for _i in range(ndets):
i = order[_i]
if suppressed[i] == 1:
continue
keep.append(i)
ix1 = x1[i]
iy1 = y1[i]
ix2 = x2[i]
iy2 = y2[i]
iarea = areas[i]
for _j in range(_i + 1, ndets):
j = order[_j]
if suppressed[j] == 1:
continue
xx1 = max(ix1, x1[j])
yy1 = max(iy1, y1[j])
xx2 = min(ix2, x2[j])
yy2 = min(iy2, y2[j])
w = max(0.0, xx2 - xx1 + 1)
h = max(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (iarea + areas[j] - inter)
if ovr >= thresh:
suppressed[j] = 1
return keep
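Editor's note: for reference, a minimal NumPy sketch of the same greedy algorithm that the Cython cpu_nms above implements (not part of the commit); it mirrors the +1 box convention and the `ovr >= thresh` suppression rule, with the inner loop vectorized.

# Hedged NumPy reference for the greedy NMS above; dets is an (N, 5) float32
# array of [x1, y1, x2, y2, score]; returns kept indices in descending score order.
import numpy as np

def nms_numpy(dets, iou_thr):
    x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(int(i))
        # IoU of the current highest-scoring box with every remaining box
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)
        # drop boxes at or above the threshold, matching `ovr >= thresh` above
        order = order[1:][ovr < iou_thr]
    return keep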
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id, size_t base);
size_t nms_Malloc();
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
# cython: language_level=3, boundscheck=False
import numpy as np
cimport numpy as np
assert sizeof(int) == sizeof(np.int32_t)
cdef extern from "gpu_nms.hpp":
void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int, size_t) nogil
size_t nms_Malloc() nogil
memory_pool = {}
def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
np.int32_t device_id=0):
cdef int boxes_num = dets.shape[0]
cdef int boxes_dim = 5
cdef int num_out
cdef size_t base
cdef np.ndarray[np.int32_t, ndim=1] \
keep = np.zeros(boxes_num, dtype=np.int32)
cdef np.ndarray[np.float32_t, ndim=1] \
scores = dets[:, 4]
cdef np.ndarray[np.int_t, ndim=1] \
order = scores.argsort()[::-1]
cdef np.ndarray[np.float32_t, ndim=2] \
sorted_dets = dets[order, :5]
cdef float cthresh = thresh
if device_id not in memory_pool:
with nogil:
base = nms_Malloc()
memory_pool[device_id] = base
# print "malloc", base
base = memory_pool[device_id]
with nogil:
_nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, cthresh, device_id, base)
keep = keep[:num_out]
return list(order[keep])
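Editor's note: the CUDA kernel behind gpu_nms works on score-sorted boxes, so the kept positions must be mapped back through `order`; a small sketch with toy data (not from the commit) of why the function returns `list(order[keep])`.

# Hedged sketch of the index bookkeeping in gpu_nms above (toy values).
import numpy as np

scores = np.array([0.3, 0.9, 0.5], dtype=np.float32)
order = scores.argsort()[::-1]      # [1, 2, 0]: box indices sorted by descending score
keep_sorted = np.array([0, 2])      # suppose the kernel keeps rows 0 and 2 of the sorted array
keep_original = order[keep_sorted]  # [1, 0]: indices back into the unsorted dets
print(keep_original.tolist())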
// ------------------------------------------------------------------
// Faster R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Shaoqing Ren
// ------------------------------------------------------------------
#include <stdio.h>
#include <iostream>
#include <vector>
#include "gpu_nms.hpp"
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define MULTIPLIER 16
#define LONGLONG_SIZE 64
int const threadsPerBlock =
sizeof(unsigned long long) * 8 *
MULTIPLIER; // number of bits for a long long variable
__device__ inline float devIoU(float const* const a, float const* const b) {
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f),
height = max(bottom - top + 1, 0.f);
float interS = width * height;
float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
const float* dev_boxes,
unsigned long long* dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
unsigned long long ts[MULTIPLIER];
if (threadIdx.x < row_size) {
#pragma unroll
for (int i = 0; i < MULTIPLIER; ++i) {
ts[i] = 0;
}
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const float* cur_box = dev_boxes + cur_box_idx * 5;
int i = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
ts[i / LONGLONG_SIZE] |= 1ULL << (i % LONGLONG_SIZE);
}
}
const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
#pragma unroll
for (int i = 0; i < MULTIPLIER; ++i) {
dev_mask[(cur_box_idx * col_blocks + col_start) * MULTIPLIER + i] =
ts[i];
}
}
}
void _set_device(int device_id) {
int current_device;
CUDA_CHECK(cudaGetDevice(&current_device));
if (current_device == device_id) {
return;
}
// The call to cudaSetDevice must come before any calls to Get, which
// may perform initialization using the GPU.
CUDA_CHECK(cudaSetDevice(device_id));
}
const size_t MEMORY_SIZE = 500000000;
size_t nms_Malloc() {
float* boxes_dev = NULL;
CUDA_CHECK(cudaMalloc(&boxes_dev, MEMORY_SIZE));
return size_t(boxes_dev);
}
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id, size_t base) {
_set_device(device_id);
float* boxes_dev = NULL;
unsigned long long* mask_dev = NULL;
const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
if (base > 0) {
size_t require_mem =
boxes_num * boxes_dim * sizeof(float) +
boxes_num * col_blocks * sizeof(unsigned long long) * MULTIPLIER;
if (require_mem >= MEMORY_SIZE) {
std::cout << "require_mem: " << require_mem << std::endl;
}
boxes_dev = (float*)(base);
mask_dev =
(unsigned long long*)(base +
512 * ((unsigned long long)(boxes_num *
boxes_dim *
sizeof(float) /
512) +
1));
} else {
CUDA_CHECK(
cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(float)));
CUDA_CHECK(cudaMalloc(&mask_dev, MULTIPLIER * boxes_num * col_blocks *
sizeof(unsigned long long)));
}
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
boxes_num * boxes_dim * sizeof(float),
cudaMemcpyHostToDevice));
dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
DIVUP(boxes_num, threadsPerBlock));
dim3 threads(threadsPerBlock);
nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev,
mask_dev);
std::vector<unsigned long long> mask_host(boxes_num * col_blocks *
MULTIPLIER);
CUDA_CHECK(cudaMemcpy(
&mask_host[0], mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks * MULTIPLIER,
cudaMemcpyDeviceToHost));
std::vector<unsigned long long> remv(col_blocks * MULTIPLIER);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks * MULTIPLIER);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
int offset = inblock / LONGLONG_SIZE;
int bit_pos = inblock % LONGLONG_SIZE;
if (!(remv[nblock * MULTIPLIER + offset] & (1ULL << bit_pos))) {
keep_out[num_to_keep++] = i;
unsigned long long* p = &mask_host[0] + i * col_blocks * MULTIPLIER;
for (int j = nblock * MULTIPLIER + offset;
j < col_blocks * MULTIPLIER; j++) {
remv[j] |= p[j];
}
}
}
*num_out = num_to_keep;
if (!base) {
CUDA_CHECK(cudaFree(boxes_dev));
CUDA_CHECK(cudaFree(mask_dev));
}
}
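Editor's note: the host-side loop at the end of _nms folds each box's suppression bitmask into a running `remv` mask. A hedged Python sketch of that reduction, with the 64-bit word packing (LONGLONG_SIZE, MULTIPLIER) abstracted into plain Python integers.

# Hedged sketch of the mask reduction above: mask[i] has bit j set when box i
# (in descending score order) suppresses box j; `removed` accumulates the bits
# of every box kept so far, mirroring the remv[j] |= p[j] loop.
def reduce_mask(mask, n_boxes):
    keep, removed = [], 0
    for i in range(n_boxes):
        if not (removed >> i) & 1:   # box i was not suppressed by any kept box
            keep.append(i)
            removed |= mask[i]       # fold in the boxes suppressed by box i
    return keep

# Toy case: box 0 suppresses box 1, boxes 1 and 2 suppress nothing.
print(reduce_mask([0b010, 0b000, 0b000], 3))   # -> [0, 2]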
 import numpy as np
 import torch
 
-from .gpu_nms import gpu_nms
-from .cpu_nms import cpu_nms
-from .cpu_soft_nms import cpu_soft_nms
+from . import nms_cuda, nms_cpu
+from .soft_nms_cpu import soft_nms_cpu
 
 
 def nms(dets, iou_thr, device_id=None):
-    """Dispatch to either CPU or GPU NMS implementations."""
+    """Dispatch to either CPU or GPU NMS implementations.
+
+    The input can be either a torch tensor or numpy array. GPU NMS will be used
+    if the input is a gpu tensor or device_id is specified, otherwise CPU NMS
+    will be used. The returned type will always be the same as the inputs.
+
+    Arguments:
+        dets (torch.Tensor or np.ndarray): bboxes with scores.
+        iou_thr (float): IoU threshold for NMS.
+        device_id (int, optional): when `dets` is a numpy array, if `device_id`
+            is None, then cpu nms is used, otherwise gpu_nms will be used.
+
+    Returns:
+        tuple: kept bboxes and indices, which is always the same data type as
+            the input.
+    """
+    # convert dets (tensor or numpy array) to tensor
     if isinstance(dets, torch.Tensor):
-        is_tensor = True
-        if dets.is_cuda:
-            device_id = dets.get_device()
-        dets_np = dets.detach().cpu().numpy()
+        is_numpy = False
+        dets_th = dets
     elif isinstance(dets, np.ndarray):
-        is_tensor = False
-        dets_np = dets
+        is_numpy = True
+        device = 'cpu' if device_id is None else 'cuda:{}'.format(device_id)
+        dets_th = torch.from_numpy(dets).to(device)
     else:
         raise TypeError(
             'dets must be either a Tensor or numpy array, but got {}'.format(
                 type(dets)))
 
-    if dets_np.shape[0] == 0:
-        inds = []
+    # execute cpu or cuda nms
+    if dets_th.shape[0] == 0:
+        inds = dets_th.new_zeros(0, dtype=torch.long)
     else:
-        inds = (gpu_nms(dets_np, iou_thr, device_id=device_id)
-                if device_id is not None else cpu_nms(dets_np, iou_thr))
+        if dets_th.is_cuda:
+            inds = nms_cuda.nms(dets_th, iou_thr)
+        else:
+            inds = nms_cpu.nms(dets_th, iou_thr)
 
-    if is_tensor:
-        inds = dets.new_tensor(inds, dtype=torch.long)
-    else:
-        inds = np.array(inds, dtype=np.int64)
+    if is_numpy:
+        inds = inds.cpu().numpy()
     return dets[inds, :], inds
 
@@ -49,7 +64,7 @@ def soft_nms(dets, iou_thr, method='linear', sigma=0.5, min_score=1e-3):
     method_codes = {'linear': 1, 'gaussian': 2}
     if method not in method_codes:
         raise ValueError('Invalid method for SoftNMS: {}'.format(method))
-    new_dets, inds = cpu_soft_nms(
+    new_dets, inds = soft_nms_cpu(
         dets_np,
         iou_thr,
         method=method_codes[method],
......
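Editor's note: a hedged usage sketch of the new dispatcher above, assuming mmdet is installed with the nms_cpu/nms_cuda extensions from this commit built and exported from the package as in the imports above.

import numpy as np
import torch
from mmdet.ops.nms import nms   # assumes the compiled extensions from this commit

dets = np.array([[10, 10, 50, 50, 0.9],
                 [12, 12, 52, 52, 0.8],
                 [100, 100, 150, 150, 0.7]], dtype=np.float32)

# NumPy in, NumPy out: CPU path, since device_id is None
kept, inds = nms(dets, iou_thr=0.5)

# Tensor in, Tensor out: the CUDA path is taken because the input lives on the GPU
if torch.cuda.is_available():
    kept_gpu, inds_gpu = nms(torch.from_numpy(dets).cuda(), iou_thr=0.5)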
 import os.path as osp
-from distutils.core import setup, Extension
+from setuptools import setup, Extension
 
 import numpy as np
 from Cython.Build import cythonize
 from Cython.Distutils import build_ext
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 
+# extensions
 ext_args = dict(
     include_dirs=[np.get_include()],
     language='c++',
@@ -16,9 +16,7 @@ ext_args = dict(
 )
 
 extensions = [
-    Extension('cpu_nms', ['cpu_nms.pyx'], **ext_args),
-    Extension('cpu_soft_nms', ['cpu_soft_nms.pyx'], **ext_args),
-    Extension('gpu_nms', ['gpu_nms.pyx', 'nms_kernel.cu'], **ext_args),
+    Extension('soft_nms_cpu', ['src/soft_nms_cpu.pyx'], **ext_args),
 ]
@@ -59,7 +57,6 @@ def customize_compiler_for_nvcc(self):
     self._compile = _compile
 
-# run the customize_compiler
 class custom_build_ext(build_ext):
 
     def build_extensions(self):
@@ -68,7 +65,20 @@ class custom_build_ext(build_ext):
 setup(
-    name='nms',
+    name='soft_nms',
     cmdclass={'build_ext': custom_build_ext},
     ext_modules=cythonize(extensions),
 )
+
+setup(
+    name='nms_cuda',
+    ext_modules=[
+        CUDAExtension('nms_cuda', [
+            'src/nms_cuda.cpp',
+            'src/nms_kernel.cu',
+        ]),
+        CUDAExtension('nms_cpu', [
+            'src/nms_cpu.cpp',
+        ]),
+    ],
+    cmdclass={'build_ext': BuildExtension})
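Editor's note: as an aside (not part of the commit), the same C++/CUDA sources could also be JIT-compiled with torch's load() helper instead of setup.py; the relative paths below assume the working directory is mmdet/ops/nms.

# Hedged sketch: JIT-compile the extension sources at import time instead of
# building them ahead of time with setup.py.
from torch.utils.cpp_extension import load

nms_cpu = load(name='nms_cpu', sources=['src/nms_cpu.cpp'], verbose=True)
nms_cuda = load(name='nms_cuda',
                sources=['src/nms_cuda.cpp', 'src/nms_kernel.cu'],
                verbose=True)
# Both modules expose nms(dets, threshold), as bound via PYBIND11_MODULE below.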
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <torch/extension.h>
template <typename scalar_t>
at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) {
AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
if (dets.numel() == 0) {
return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
}
auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
auto scores = dets.select(1, 4).contiguous();
at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
at::Tensor suppressed_t =
at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data<uint8_t>();
auto order = order_t.data<int64_t>();
auto x1 = x1_t.data<scalar_t>();
auto y1 = y1_t.data<scalar_t>();
auto x2 = x2_t.data<scalar_t>();
auto y2 = y2_t.data<scalar_t>();
auto areas = areas_t.data<scalar_t>();
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) continue;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= threshold) suppressed[j] = 1;
}
}
return at::nonzero(suppressed_t == 0).squeeze(1);
}
at::Tensor nms(const at::Tensor& dets, const float threshold) {
at::Tensor result;
AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
result = nms_cpu_kernel<scalar_t>(dets, threshold);
});
return result;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("nms", &nms, "non-maximum suppression");
}
\ No newline at end of file
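Editor's note: a hedged pure-PyTorch sketch of what nms_cpu_kernel above computes, mainly to highlight its return convention: suppressed boxes are flagged in place and the surviving indices come back in ascending box order rather than score order.

# Hedged, deliberately slow reference for nms_cpu_kernel above.
import torch

def nms_cpu_reference(dets, thr):
    x1, y1, x2, y2, scores = dets.unbind(1)
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.sort(descending=True)[1]
    suppressed = torch.zeros(dets.size(0), dtype=torch.bool)
    for _i in range(dets.size(0)):
        i = order[_i]
        if suppressed[i]:
            continue
        for _j in range(_i + 1, dets.size(0)):
            j = order[_j]
            if suppressed[j]:
                continue
            w = (torch.min(x2[i], x2[j]) - torch.max(x1[i], x1[j]) + 1).clamp(min=0)
            h = (torch.min(y2[i], y2[j]) - torch.max(y1[i], y1[j]) + 1).clamp(min=0)
            inter = w * h
            if inter / (areas[i] + areas[j] - inter) >= thr:
                suppressed[j] = True
    return (~suppressed).nonzero().squeeze(1)   # ascending indices of kept boxes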
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <torch/extension.h>
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
at::Tensor nms(const at::Tensor& dets, const float threshold) {
CHECK_CUDA(dets);
if (dets.numel() == 0)
return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
return nms_cuda(dets, threshold);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("nms", &nms, "non-maximum suppression");
}
\ No newline at end of file
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>
#include <vector>
#include <iostream>
int const threadsPerBlock = sizeof(unsigned long long) * 8;
__device__ inline float devIoU(float const * const a, float const * const b) {
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
float interS = width * height;
float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
const float *dev_boxes, unsigned long long *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const float *cur_box = dev_boxes + cur_box_idx * 5;
int i = 0;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
// boxes is a N x 5 tensor
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
using scalar_t = float;
AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
auto scores = boxes.select(1, 4);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto boxes_sorted = boxes.index_select(0, order_t);
int boxes_num = boxes.size(0);
const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();
THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
unsigned long long* mask_dev = NULL;
//THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
// boxes_num * col_blocks * sizeof(unsigned long long)));
mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
THCCeilDiv(boxes_num, threadsPerBlock));
dim3 threads(threadsPerBlock);
nms_kernel<<<blocks, threads>>>(boxes_num,
nms_overlap_thresh,
boxes_dev,
mask_dev);
std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
THCudaCheck(cudaMemcpy(&mask_host[0],
mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks,
cudaMemcpyDeviceToHost));
std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
int64_t* keep_out = keep.data<int64_t>();
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
keep_out[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
}
}
}
THCudaFree(state, mask_dev);
// TODO improve this part
return std::get<0>(order_t.index({
keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
order_t.device(), keep.scalar_type())
}).sort(0, false));
}
\ No newline at end of file
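Editor's note: the kernel above tiles the N x N IoU comparison into 64-box blocks; a hedged sketch of how the launch grid and the mask buffer scale with the number of boxes.

# Hedged sketch of the tiling in nms_kernel above: threadsPerBlock = 64 boxes per
# block, and each (row box, column block) pair yields one 64-bit mask word.
THREADS_PER_BLOCK = 64   # sizeof(unsigned long long) * 8

def mask_layout(n_boxes):
    col_blocks = (n_boxes + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK  # THCCeilDiv
    grid = (col_blocks, col_blocks)    # dim3 blocks(x, y)
    mask_words = n_boxes * col_blocks  # unsigned long longs copied back to the host
    return grid, mask_words

print(mask_layout(1000))   # -> ((16, 16), 16000)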
@@ -19,7 +19,7 @@ cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
     return a if a <= b else b
 
 
-def cpu_soft_nms(
+def soft_nms_cpu(
     np.ndarray[float, ndim=2] boxes_in,
     float iou_thr,
     unsigned int method=1,
......
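Editor's note: for context on the renamed soft_nms_cpu kernel, a hedged sketch of the per-box score decay it applies; the method codes 1 = linear and 2 = gaussian come from the wrapper above, and the real Cython kernel additionally re-sorts boxes and drops those whose score falls below min_score.

# Hedged sketch of the soft-NMS score weighting; `iou` is the overlap between the
# current top box and a lower-scoring box, whose score gets multiplied by the weight.
import numpy as np

def soft_nms_weight(iou, iou_thr, method=1, sigma=0.5):
    if method == 1:                 # linear: decay only once the overlap exceeds iou_thr
        return 1.0 - iou if iou > iou_thr else 1.0
    elif method == 2:               # gaussian: smooth decay for any overlap
        return float(np.exp(-(iou * iou) / sigma))
    else:                           # fallback: hard suppression, as in plain NMS
        return 0.0 if iou > iou_thr else 1.0

print(soft_nms_weight(0.6, 0.3, method=1))             # -> 0.4
print(round(soft_nms_weight(0.6, 0.3, method=2), 3))   # gaussian weight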