#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <cmath>
#include <vector>

#ifdef _OPENMP
#include <omp.h>
#endif

// Intersection-over-union of boxes `idx_x` and `idx_y`.  Boxes are stored
// flat as (x, y, w, h), so box i occupies rawInput[i*4 .. i*4+3].
template<typename scalar>
inline scalar IoU(scalar* rawInput, int idx_x, int idx_y) {
    // Horizontal overlap: left-most right edge minus right-most left edge.
    scalar lr = std::fmin(rawInput[idx_x*4] + rawInput[idx_x*4+2],
                          rawInput[idx_y*4] + rawInput[idx_y*4+2]);
    scalar rl = std::fmax(rawInput[idx_x*4], rawInput[idx_y*4]);
    // Vertical overlap: top-most bottom edge minus bottom-most top edge.
    scalar tb = std::fmin(rawInput[idx_x*4+1] + rawInput[idx_x*4+3],
                          rawInput[idx_y*4+1] + rawInput[idx_y*4+3]);
    scalar bt = std::fmax(rawInput[idx_x*4+1], rawInput[idx_y*4+1]);
    scalar inter = std::fmax(scalar(0), lr-rl)*std::fmax(scalar(0), tb-bt);
    scalar uni = (rawInput[idx_x*4+2]*rawInput[idx_x*4+3]
                 + rawInput[idx_y*4+2]*rawInput[idx_y*4+3] - inter);
    return inter/uni;
}
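// Worked example: boxes A = (0, 0, 2, 2) and B = (1, 1, 2, 2) overlap in a
// 1x1 square, so inter = 1 and uni = 4 + 4 - 1 = 7, giving IoU = 1/7 ≈ 0.143.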


std::vector<at::Tensor> Non_Max_Suppression_CPU(
    const at::Tensor& input,
    const at::Tensor& scores,
    double thresh) {
  AT_ASSERT(input.ndimension() == 3);
  AT_ASSERT(scores.ndimension() == 2);
  AT_ASSERT(input.size(0) == scores.size(0));
  AT_ASSERT(input.size(1) == scores.size(1));
  AT_ASSERT(input.size(2) == 4);
  AT_ASSERT(input.is_contiguous());
  AT_ASSERT(scores.is_contiguous());
  AT_ASSERT(input.scalar_type() == at::kFloat || input.scalar_type() == at::kDouble);
  AT_ASSERT(scores.scalar_type() == at::kFloat || scores.scalar_type() == at::kDouble);
  // Indices that sort each batch's scores in descending order.
  at::Tensor sorted_inds = std::get<1>(scores.sort(-1, true));

  auto num_boxes = input.size(1);
  auto batch_size = input.size(0);
  // Keep-mask over boxes in sorted order: 1 = keep, 0 = suppressed.
  auto mask = torch::ones({batch_size, num_boxes}, input.options().dtype(at::kByte));

  auto *rawMask = mask.data_ptr<unsigned char>();
  auto *rawIdx = sorted_inds.data_ptr<int64_t>();

  // Walk each batch's boxes in descending score order; suppress every box
  // whose IoU with the current survivor exceeds `thresh`.  rawMask follows
  // the sorted order, i.e. rawMask[i] refers to box rawIdx[i] of its batch.
  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "Non_Max_Suppression_CPU", [&] {
    auto *rawInput = input.data_ptr<scalar_t>();
    for (int batch = 0; batch < batch_size; ++batch)
    {
      int pos = batch * num_boxes;
      while (pos < (1 + batch) * num_boxes - 1)
      {
        // The box at `pos` survives; test all lower-scoring boxes against it.
        int idx_x = rawIdx[pos] + num_boxes * batch;
#pragma omp parallel for
        for (int i = pos + 1; i < num_boxes * (1 + batch); ++i)
        {
          int idx_y = rawIdx[i] + num_boxes * batch;
          if (IoU(rawInput, idx_x, idx_y) > thresh)
            rawMask[i] = 0;
        }
        // Advance to the next box that has not been suppressed.
        ++pos;
        while (pos < (1 + batch) * num_boxes - 1 && rawMask[pos] == 0)
          ++pos;
      }
    }
  });
  // See ./cuda/NonMaxSuppression.cu for a note on the return value.
  return {mask, sorted_inds};
}
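
// Minimal binding sketch, assuming this file is compiled as a standalone
// extension via torch.utils.cpp_extension; the project may instead register
// Non_Max_Suppression_CPU in a separate binding file.  The exposed name
// "non_max_suppression_cpu" below is a hypothetical choice, not confirmed
// by the source.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("non_max_suppression_cpu", &Non_Max_Suppression_CPU,
        "Batched non-maximum suppression on CPU; returns {mask, sorted_inds}");
}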