// ------------------------------------------------------------------
// Faster R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Shaoqing Ren
// ------------------------------------------------------------------

#include <cstring>
#include <iostream>
#include <vector>
#include "gpu_nms.hpp"

#define CUDA_CHECK(condition)                                \
  /* Code block avoids redefinition of cudaError_t error */  \
  do {                                                       \
    cudaError_t error = condition;                           \
    if (error != cudaSuccess) {                              \
      std::cout << cudaGetErrorString(error) << std::endl;   \
    }                                                        \
  } while (0)

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define MULTIPLIER 16
#define LONGLONG_SIZE 64

// Each thread handles one box and stores its suppression mask in
// MULTIPLIER 64-bit words, so a block covers 64 * MULTIPLIER boxes.
int const threadsPerBlock =
    sizeof(unsigned long long) * 8 *
    MULTIPLIER;  // bits in an unsigned long long, times MULTIPLIER

// Intersection-over-union of two boxes given as (x1, y1, x2, y2, ...).
__device__ inline float devIoU(float const* const a, float const* const b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}

// Each block (row_start, col_start) compares a tile of up to threadsPerBlock
// "row" boxes against a tile of "column" boxes and records, per row box, a
// bitmask of the column boxes it overlaps above the threshold.
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float* dev_boxes,
                           unsigned long long* dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size =
      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  // Stage the column boxes (x1, y1, x2, y2, score) in shared memory.
  __shared__ float block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  unsigned long long ts[MULTIPLIER];
  if (threadIdx.x < row_size) {
#pragma unroll
    for (int i = 0; i < MULTIPLIER; ++i) {
      ts[i] = 0;
    }
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const float* cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    int start = 0;
    // On the diagonal tile, only compare against later boxes so a box is
    // never compared with itself.
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        ts[i / LONGLONG_SIZE] |= 1ULL << (i % LONGLONG_SIZE);
      }
    }
    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
#pragma unroll
    for (int i = 0; i < MULTIPLIER; ++i) {
      dev_mask[(cur_box_idx * col_blocks + col_start) * MULTIPLIER + i] = ts[i];
    }
  }
}

void _set_device(int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
}

const size_t MEMORY_SIZE = 500000000;

// Pre-allocates a reusable device workspace of MEMORY_SIZE bytes and returns
// its address as an integer, to be passed to _nms as `base`.
size_t nms_Malloc() {
  float* boxes_dev = NULL;
  CUDA_CHECK(cudaMalloc(&boxes_dev, MEMORY_SIZE));
  return size_t(boxes_dev);
}

void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
          int boxes_dim, float nms_overlap_thresh, int device_id,
          size_t base) {
  _set_device(device_id);

  float* boxes_dev = NULL;
  unsigned long long* mask_dev = NULL;

  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);

  if (base > 0) {
    // Carve the boxes and mask buffers out of the pre-allocated workspace;
    // the mask buffer starts at the next 512-byte boundary after the boxes.
    size_t require_mem =
        boxes_num * boxes_dim * sizeof(float) +
        boxes_num * col_blocks * sizeof(unsigned long long) * MULTIPLIER;
    if (require_mem >= MEMORY_SIZE) {
      std::cout << "require_mem: " << require_mem << std::endl;
    }
    boxes_dev = (float*)(base);
    mask_dev = (unsigned long long*)(base +
        512 * ((unsigned long long)(boxes_num * boxes_dim * sizeof(float) / 512) + 1));
  } else {
    CUDA_CHECK(cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&mask_dev, MULTIPLIER * boxes_num * col_blocks *
                                         sizeof(unsigned long long)));
  }
  CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
                        boxes_num * boxes_dim * sizeof(float),
                        cudaMemcpyHostToDevice));

  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
              DIVUP(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev,
                                  mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks * MULTIPLIER);
  CUDA_CHECK(cudaMemcpy(&mask_host[0], mask_dev,
                        sizeof(unsigned long long) * boxes_num * col_blocks *
                            MULTIPLIER,
                        cudaMemcpyDeviceToHost));

  std::vector<unsigned long long> remv(col_blocks * MULTIPLIER);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks * MULTIPLIER);

  // Greedy suppression on the host: walk boxes in score order, keep a box if
  // no previously kept box has marked it, then OR its mask into remv.
  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;
    int offset = inblock / LONGLONG_SIZE;
    int bit_pos = inblock % LONGLONG_SIZE;

    if (!(remv[nblock * MULTIPLIER + offset] & (1ULL << bit_pos))) {
      keep_out[num_to_keep++] = i;
      unsigned long long* p = &mask_host[0] + i * col_blocks * MULTIPLIER;
      for (int j = nblock * MULTIPLIER + offset; j < col_blocks * MULTIPLIER;
           j++) {
        remv[j] |= p[j];
      }
    }
  }
  *num_out = num_to_keep;

  if (!base) {
    CUDA_CHECK(cudaFree(boxes_dev));
    CUDA_CHECK(cudaFree(mask_dev));
  }
}
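
// ------------------------------------------------------------------
// Illustrative usage sketch (not part of the original Faster R-CNN code).
// It assumes the (x1, y1, x2, y2, score) box layout used by the kernel and
// that boxes arrive pre-sorted by score in descending order, which the
// greedy suppression loop in _nms relies on. The function name
// example_nms_usage is hypothetical.
// ------------------------------------------------------------------
static void example_nms_usage() {
  // Three boxes: the second overlaps the first heavily, the third is disjoint.
  const float boxes[3 * 5] = {
      0.f,  0.f,  10.f, 10.f, 0.9f,  // highest score
      1.f,  1.f,  11.f, 11.f, 0.8f,  // IoU with the first box is about 0.70
      50.f, 50.f, 60.f, 60.f, 0.7f,  // no overlap with the others
  };
  int keep[3];
  int num_kept = 0;
  // base == 0: let _nms allocate and free its own device buffers instead of
  // using a workspace from nms_Malloc().
  _nms(keep, &num_kept, boxes, /*boxes_num=*/3, /*boxes_dim=*/5,
       /*nms_overlap_thresh=*/0.7f, /*device_id=*/0, /*base=*/0);
  // With a 0.7 threshold the second box should be suppressed by the first,
  // leaving keep = {0, 2} and num_kept == 2.
}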