Unverified commit e421e832, authored by Kai Chen, committed by GitHub

port nms extension from maskrcnn-benchmark (#469)

* port nms extension from maskrcnn-benchmark

* fix linting error
parent ee7e679a
@@ -104,7 +104,7 @@ venv.bak/
.mypy_cache/
# cython generated cpp
mmdet/ops/nms/*.cpp
mmdet/ops/nms/src/soft_nms_cpu.cpp
mmdet/version.py
data
.vscode
@@ -18,8 +18,10 @@ $PYTHON setup.py build_ext --inplace
echo "Building nms op..."
cd ../nms
make clean
make PYTHON=${PYTHON}
if [ -d "build" ]; then
    rm -r build
fi
$PYTHON setup.py build_ext --inplace
echo "Building dcn..."
cd ../dcn
PYTHON=${PYTHON:-python}

all:
	echo "Compiling nms kernels..."
	$(PYTHON) setup.py build_ext --inplace

clean:
	rm -f *.so
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
# cython: language_level=3, boundscheck=False
import numpy as np
cimport numpy as np
cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
    return a if a >= b else b

cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
    return a if a <= b else b

def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
    cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
    cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
    cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
    cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
    cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
    cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]
    cdef int ndets = dets.shape[0]
    cdef np.ndarray[np.int_t, ndim=1] suppressed = \
        np.zeros((ndets), dtype=np.int)
    # nominal indices
    cdef int _i, _j
    # sorted indices
    cdef int i, j
    # temp variables for box i's (the box currently under consideration)
    cdef np.float32_t ix1, iy1, ix2, iy2, iarea
    # variables for computing overlap with box j (lower scoring box)
    cdef np.float32_t xx1, yy1, xx2, yy2
    cdef np.float32_t w, h
    cdef np.float32_t inter, ovr
    keep = []
    for _i in range(ndets):
        i = order[_i]
        if suppressed[i] == 1:
            continue
        keep.append(i)
        ix1 = x1[i]
        iy1 = y1[i]
        ix2 = x2[i]
        iy2 = y2[i]
        iarea = areas[i]
        for _j in range(_i + 1, ndets):
            j = order[_j]
            if suppressed[j] == 1:
                continue
            xx1 = max(ix1, x1[j])
            yy1 = max(iy1, y1[j])
            xx2 = min(ix2, x2[j])
            yy2 = min(iy2, y2[j])
            w = max(0.0, xx2 - xx1 + 1)
            h = max(0.0, yy2 - yy1 + 1)
            inter = w * h
            ovr = inter / (iarea + areas[j] - inter)
            if ovr >= thresh:
                suppressed[j] = 1
    return keep
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id, size_t base);
size_t nms_Malloc();
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------
# cython: language_level=3, boundscheck=False
import numpy as np
cimport numpy as np
assert sizeof(int) == sizeof(np.int32_t)
cdef extern from "gpu_nms.hpp":
    void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int, size_t) nogil
    size_t nms_Malloc() nogil

memory_pool = {}

def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
            np.int32_t device_id=0):
    cdef int boxes_num = dets.shape[0]
    cdef int boxes_dim = 5
    cdef int num_out
    cdef size_t base
    cdef np.ndarray[np.int32_t, ndim=1] \
        keep = np.zeros(boxes_num, dtype=np.int32)
    cdef np.ndarray[np.float32_t, ndim=1] \
        scores = dets[:, 4]
    cdef np.ndarray[np.int_t, ndim=1] \
        order = scores.argsort()[::-1]
    cdef np.ndarray[np.float32_t, ndim=2] \
        sorted_dets = dets[order, :5]
    cdef float cthresh = thresh
    if device_id not in memory_pool:
        with nogil:
            base = nms_Malloc()
        memory_pool[device_id] = base
        # print "malloc", base
    base = memory_pool[device_id]
    with nogil:
        _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, cthresh, device_id, base)
    keep = keep[:num_out]
    return list(order[keep])
// ------------------------------------------------------------------
// Faster R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Shaoqing Ren
// ------------------------------------------------------------------
#include <stdio.h>
#include <iostream>
#include <vector>
#include "gpu_nms.hpp"
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
if (error != cudaSuccess) { \
std::cout << cudaGetErrorString(error) << std::endl; \
} \
} while (0)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define MULTIPLIER 16
#define LONGLONG_SIZE 64
int const threadsPerBlock =
    sizeof(unsigned long long) * 8 *
    MULTIPLIER;  // bits in an unsigned long long, times MULTIPLIER
__device__ inline float devIoU(float const* const a, float const* const b) {
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f),
height = max(bottom - top + 1, 0.f);
float interS = width * height;
float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
const float* dev_boxes,
unsigned long long* dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
unsigned long long ts[MULTIPLIER];
if (threadIdx.x < row_size) {
#pragma unroll
for (int i = 0; i < MULTIPLIER; ++i) {
ts[i] = 0;
}
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const float* cur_box = dev_boxes + cur_box_idx * 5;
int i = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
ts[i / LONGLONG_SIZE] |= 1ULL << (i % LONGLONG_SIZE);
}
}
const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
#pragma unroll
for (int i = 0; i < MULTIPLIER; ++i) {
dev_mask[(cur_box_idx * col_blocks + col_start) * MULTIPLIER + i] =
ts[i];
}
}
}
void _set_device(int device_id) {
int current_device;
CUDA_CHECK(cudaGetDevice(&current_device));
if (current_device == device_id) {
return;
}
// The call to cudaSetDevice must come before any calls to Get, which
// may perform initialization using the GPU.
CUDA_CHECK(cudaSetDevice(device_id));
}
const size_t MEMORY_SIZE = 500000000;
size_t nms_Malloc() {
float* boxes_dev = NULL;
CUDA_CHECK(cudaMalloc(&boxes_dev, MEMORY_SIZE));
return size_t(boxes_dev);
}
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
int boxes_dim, float nms_overlap_thresh, int device_id, size_t base) {
_set_device(device_id);
float* boxes_dev = NULL;
unsigned long long* mask_dev = NULL;
const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
if (base > 0) {
size_t require_mem =
boxes_num * boxes_dim * sizeof(float) +
boxes_num * col_blocks * sizeof(unsigned long long) * MULTIPLIER;
if (require_mem >= MEMORY_SIZE) {
std::cout << "require_mem: " << require_mem << std::endl;
}
boxes_dev = (float*)(base);
mask_dev =
(unsigned long long*)(base +
512 * ((unsigned long long)(boxes_num *
boxes_dim *
sizeof(float) /
512) +
1));
} else {
CUDA_CHECK(
cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(float)));
CUDA_CHECK(cudaMalloc(&mask_dev, MULTIPLIER * boxes_num * col_blocks *
sizeof(unsigned long long)));
}
CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
boxes_num * boxes_dim * sizeof(float),
cudaMemcpyHostToDevice));
dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
DIVUP(boxes_num, threadsPerBlock));
dim3 threads(threadsPerBlock);
nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev,
mask_dev);
std::vector<unsigned long long> mask_host(boxes_num * col_blocks *
MULTIPLIER);
CUDA_CHECK(cudaMemcpy(
&mask_host[0], mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks * MULTIPLIER,
cudaMemcpyDeviceToHost));
std::vector<unsigned long long> remv(col_blocks * MULTIPLIER);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks * MULTIPLIER);
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
int offset = inblock / LONGLONG_SIZE;
int bit_pos = inblock % LONGLONG_SIZE;
if (!(remv[nblock * MULTIPLIER + offset] & (1ULL << bit_pos))) {
keep_out[num_to_keep++] = i;
unsigned long long* p = &mask_host[0] + i * col_blocks * MULTIPLIER;
for (int j = nblock * MULTIPLIER + offset;
j < col_blocks * MULTIPLIER; j++) {
remv[j] |= p[j];
}
}
}
*num_out = num_to_keep;
if (!base) {
CUDA_CHECK(cudaFree(boxes_dev));
CUDA_CHECK(cudaFree(mask_dev));
}
}
import numpy as np
import torch
from .gpu_nms import gpu_nms
from .cpu_nms import cpu_nms
from .cpu_soft_nms import cpu_soft_nms
from . import nms_cuda, nms_cpu
from .soft_nms_cpu import soft_nms_cpu
def nms(dets, iou_thr, device_id=None):
    """Dispatch to either CPU or GPU NMS implementations."""
    """Dispatch to either CPU or GPU NMS implementations.

    The input can be either a torch tensor or numpy array. GPU NMS will be used
    if the input is a gpu tensor or device_id is specified, otherwise CPU NMS
    will be used. The returned type will always be the same as inputs.

    Arguments:
        dets (torch.Tensor or np.ndarray): bboxes with scores.
        iou_thr (float): IoU threshold for NMS.
        device_id (int, optional): when `dets` is a numpy array, if `device_id`
            is None, then cpu nms is used, otherwise gpu_nms will be used.

    Returns:
        tuple: kept bboxes and indices, which are always the same data type as
            the input.
    """
    # convert dets (tensor or numpy array) to tensor
    if isinstance(dets, torch.Tensor):
        is_tensor = True
        if dets.is_cuda:
            device_id = dets.get_device()
        dets_np = dets.detach().cpu().numpy()
        is_numpy = False
        dets_th = dets
    elif isinstance(dets, np.ndarray):
        is_tensor = False
        dets_np = dets
        is_numpy = True
        device = 'cpu' if device_id is None else 'cuda:{}'.format(device_id)
        dets_th = torch.from_numpy(dets).to(device)
    else:
        raise TypeError(
            'dets must be either a Tensor or numpy array, but got {}'.format(
                type(dets)))

    if dets_np.shape[0] == 0:
        inds = []
    # execute cpu or cuda nms
    if dets_th.shape[0] == 0:
        inds = dets_th.new_zeros(0, dtype=torch.long)
    else:
        inds = (gpu_nms(dets_np, iou_thr, device_id=device_id)
                if device_id is not None else cpu_nms(dets_np, iou_thr))
    if is_tensor:
        inds = dets.new_tensor(inds, dtype=torch.long)
        if dets_th.is_cuda:
            inds = nms_cuda.nms(dets_th, iou_thr)
        else:
            inds = np.array(inds, dtype=np.int64)
            inds = nms_cpu.nms(dets_th, iou_thr)
    if is_numpy:
        inds = inds.cpu().numpy()
    return dets[inds, :], inds
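For context, a minimal usage sketch of the nms dispatcher above, assuming the nms_cpu/nms_cuda extensions have been built and exported from mmdet.ops.nms; the box coordinates and scores below are made up for illustration:

import numpy as np
import torch
from mmdet.ops.nms import nms  # assumed import path for the wrapper above

# three boxes in (x1, y1, x2, y2, score) format; the first two overlap heavily
dets = np.array([[0, 0, 10, 10, 0.9],
                 [1, 1, 11, 11, 0.8],
                 [50, 50, 60, 60, 0.7]], dtype=np.float32)

# numpy in, numpy out (CPU NMS, since device_id is None)
kept_dets, inds = nms(dets, iou_thr=0.5)

# tensor in, tensor out; a CUDA tensor would dispatch to nms_cuda instead
kept_dets_t, inds_t = nms(torch.from_numpy(dets), iou_thr=0.5)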
@@ -49,7 +64,7 @@ def soft_nms(dets, iou_thr, method='linear', sigma=0.5, min_score=1e-3):
    method_codes = {'linear': 1, 'gaussian': 2}
    if method not in method_codes:
        raise ValueError('Invalid method for SoftNMS: {}'.format(method))
    new_dets, inds = cpu_soft_nms(
    new_dets, inds = soft_nms_cpu(
        dets_np,
        iou_thr,
        method=method_codes[method],
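A corresponding sketch for soft_nms, whose backend was renamed from cpu_soft_nms to soft_nms_cpu in this change. The keyword arguments come from the signature in the hunk header above; the import path and the (new_dets, inds) return value are assumptions based on the diff, not shown in full here:

import numpy as np
from mmdet.ops.nms import soft_nms  # assumed export, mirroring nms above

dets = np.array([[0, 0, 10, 10, 0.9],
                 [1, 1, 11, 11, 0.8]], dtype=np.float32)

# soft-NMS decays the scores of overlapping boxes instead of dropping them
new_dets, inds = soft_nms(dets, iou_thr=0.5, method='linear', sigma=0.5,
                          min_score=1e-3)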
import os.path as osp
from distutils.core import setup, Extension
from setuptools import setup, Extension
import numpy as np
from Cython.Build import cythonize
from Cython.Distutils import build_ext
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
# extensions
ext_args = dict(
    include_dirs=[np.get_include()],
    language='c++',
@@ -16,9 +16,7 @@ ext_args = dict(
)

extensions = [
    Extension('cpu_nms', ['cpu_nms.pyx'], **ext_args),
    Extension('cpu_soft_nms', ['cpu_soft_nms.pyx'], **ext_args),
    Extension('gpu_nms', ['gpu_nms.pyx', 'nms_kernel.cu'], **ext_args),
    Extension('soft_nms_cpu', ['src/soft_nms_cpu.pyx'], **ext_args),
]
@@ -59,7 +57,6 @@ def customize_compiler_for_nvcc(self):
    self._compile = _compile


# run the customize_compiler
class custom_build_ext(build_ext):

    def build_extensions(self):
@@ -68,7 +65,20 @@ class custom_build_ext(build_ext):
setup(
    name='nms',
    name='soft_nms',
    cmdclass={'build_ext': custom_build_ext},
    ext_modules=cythonize(extensions),
)

setup(
    name='nms_cuda',
    ext_modules=[
        CUDAExtension('nms_cuda', [
            'src/nms_cuda.cpp',
            'src/nms_kernel.cu',
        ]),
        CUDAExtension('nms_cpu', [
            'src/nms_cpu.cpp',
        ]),
    ],
    cmdclass={'build_ext': BuildExtension})
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <torch/extension.h>
template <typename scalar_t>
at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) {
AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
if (dets.numel() == 0) {
return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
}
auto x1_t = dets.select(1, 0).contiguous();
auto y1_t = dets.select(1, 1).contiguous();
auto x2_t = dets.select(1, 2).contiguous();
auto y2_t = dets.select(1, 3).contiguous();
auto scores = dets.select(1, 4).contiguous();
at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
at::Tensor suppressed_t =
at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
auto suppressed = suppressed_t.data<uint8_t>();
auto order = order_t.data<int64_t>();
auto x1 = x1_t.data<scalar_t>();
auto y1 = y1_t.data<scalar_t>();
auto x2 = x2_t.data<scalar_t>();
auto y2 = y2_t.data<scalar_t>();
auto areas = areas_t.data<scalar_t>();
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) continue;
auto ix1 = x1[i];
auto iy1 = y1[i];
auto ix2 = x2[i];
auto iy2 = y2[i];
auto iarea = areas[i];
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) continue;
auto xx1 = std::max(ix1, x1[j]);
auto yy1 = std::max(iy1, y1[j]);
auto xx2 = std::min(ix2, x2[j]);
auto yy2 = std::min(iy2, y2[j]);
auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1 + 1);
auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1 + 1);
auto inter = w * h;
auto ovr = inter / (iarea + areas[j] - inter);
if (ovr >= threshold) suppressed[j] = 1;
}
}
return at::nonzero(suppressed_t == 0).squeeze(1);
}
at::Tensor nms(const at::Tensor& dets, const float threshold) {
at::Tensor result;
AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] {
result = nms_cpu_kernel<scalar_t>(dets, threshold);
});
return result;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("nms", &nms, "non-maximum suppression");
}
\ No newline at end of file
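For orientation, a small sketch of calling the compiled nms_cpu extension above directly (normally this goes through nms_wrapper.py). The import path assumes an in-place build under mmdet/ops/nms, and the box values are illustrative:

import torch
from mmdet.ops.nms import nms_cpu  # built by the nms_cpu extension in setup.py above

# boxes as (x1, y1, x2, y2, score); the extension sorts by score internally
dets = torch.tensor([[0., 0., 10., 10., 0.9],
                     [1., 1., 11., 11., 0.8],
                     [50., 50., 60., 60., 0.7]])

keep = nms_cpu.nms(dets, 0.5)  # LongTensor of indices of surviving boxes
print(keep)  # tensor([0, 2]) for this toy input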
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <torch/extension.h>
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
at::Tensor nms(const at::Tensor& dets, const float threshold) {
CHECK_CUDA(dets);
if (dets.numel() == 0)
return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
return nms_cuda(dets, threshold);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("nms", &nms, "non-maximum suppression");
}
\ No newline at end of file
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>
#include <vector>
#include <iostream>
int const threadsPerBlock = sizeof(unsigned long long) * 8;
__device__ inline float devIoU(float const * const a, float const * const b) {
float left = max(a[0], b[0]), right = min(a[2], b[2]);
float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
float interS = width * height;
float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
return interS / (Sa + Sb - interS);
}
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
const float *dev_boxes, unsigned long long *dev_mask) {
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
__shared__ float block_boxes[threadsPerBlock * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const float *cur_box = dev_boxes + cur_box_idx * 5;
int i = 0;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
t |= 1ULL << i;
}
}
const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
// boxes is an N x 5 tensor
at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
using scalar_t = float;
AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
auto scores = boxes.select(1, 4);
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto boxes_sorted = boxes.index_select(0, order_t);
int boxes_num = boxes.size(0);
const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();
THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
unsigned long long* mask_dev = NULL;
//THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
// boxes_num * col_blocks * sizeof(unsigned long long)));
mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
THCCeilDiv(boxes_num, threadsPerBlock));
dim3 threads(threadsPerBlock);
nms_kernel<<<blocks, threads>>>(boxes_num,
nms_overlap_thresh,
boxes_dev,
mask_dev);
std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
THCudaCheck(cudaMemcpy(&mask_host[0],
mask_dev,
sizeof(unsigned long long) * boxes_num * col_blocks,
cudaMemcpyDeviceToHost));
std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
int64_t* keep_out = keep.data<int64_t>();
int num_to_keep = 0;
for (int i = 0; i < boxes_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
keep_out[num_to_keep++] = i;
unsigned long long *p = &mask_host[0] + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
}
}
}
THCudaFree(state, mask_dev);
// TODO improve this part
return std::get<0>(order_t.index({
keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
order_t.device(), keep.scalar_type())
}).sort(0, false));
}
\ No newline at end of file
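To make the bitmask bookkeeping above easier to follow, here is a rough NumPy sketch of the host-side decode loop (the part after the cudaMemcpy). The function name decode_mask is hypothetical, and it assumes mask_host has been reshaped to [boxes_num, col_blocks] with boxes already sorted by score:

import numpy as np

THREADS_PER_BLOCK = 64  # sizeof(unsigned long long) * 8, as in the kernel


def decode_mask(mask_host, boxes_num):
    # mask_host[i] packs, 64 boxes per uint64, which boxes overlap box i
    # above the IoU threshold (as computed by nms_kernel).
    col_blocks = (boxes_num + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK
    remv = np.zeros(col_blocks, dtype=np.uint64)  # bits of suppressed boxes
    keep = []
    for i in range(boxes_num):
        nblock, inblock = divmod(i, THREADS_PER_BLOCK)
        if not ((int(remv[nblock]) >> inblock) & 1):
            keep.append(i)  # box i survives; suppress everything it overlaps
            remv |= mask_host[i]
    return keep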
@@ -19,7 +19,7 @@ cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
    return a if a <= b else b


def cpu_soft_nms(
def soft_nms_cpu(
    np.ndarray[float, ndim=2] boxes_in,
    float iou_thr,
    unsigned int method=1,