Unverified Commit f74bfab6 authored by Nicolas Hug, committed by GitHub

Add quantized version of nms (#3601)

* Add quantized version of nms

* Added tests

* Compute areas only once

* remove calls to dequantize_val

* fix return type for empty tensor

* flake8

* remove use of scale as it gets cancelled out

* simpler int conversion in tests

* explicitly set ovr to double

* add tests for more values of scale and zero_point

* comment about underflow

* remove unnecessary accessor

* properly convert to float for division

* Add comments about underflow

* explicitly cast coordinates to float to allow vectorization

* clang

* clang again

* hopefully OK now
parent 978ba613
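The commit message points out that the quantization scale can be left out of the area and intersection computations because it cancels in the IoU ratio. A minimal sketch of that argument in plain Python (not part of the commit; the coordinates, the iou helper and the scale/zero_point values are made up for illustration): with affine quantization x = scale * (q - zero_point), every coordinate difference carries one factor of scale, so areas and intersections each carry scale**2, which cancels in the division.

def iou(box_a, box_b):
    # Plain IoU of two boxes given as (x1, y1, x2, y2).
    ax1, ay1, ax2, ay2 = box_a
    bx1, by1, bx2, by2 = box_b
    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    w = max(0, min(ax2, bx2) - max(ax1, bx1))
    h = max(0, min(ay2, by2) - max(ay1, by1))
    inter = w * h
    return inter / (area_a + area_b - inter)

scale, zero_point = 2.0, 10

# Raw quantized integer coordinates (made-up values)...
qa, qb = (12, 15, 60, 70), (30, 25, 80, 90)
# ...and their dequantized float counterparts: scale * (q - zero_point).
fa = tuple(scale * (v - zero_point) for v in qa)
fb = tuple(scale * (v - zero_point) for v in qb)

# zero_point cancels in every coordinate difference and scale**2 cancels
# in the ratio, so both IoUs agree (up to float rounding).
assert abs(iou(qa, qb) - iou(fa, fb)) < 1e-6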
@@ -138,8 +138,11 @@ def get_extensions():
     main_file = glob.glob(os.path.join(extensions_dir, '*.cpp')) + glob.glob(os.path.join(extensions_dir, 'ops',
                                                                              '*.cpp'))
-    source_cpu = glob.glob(os.path.join(extensions_dir, 'ops', 'autograd', '*.cpp')) + glob.glob(
-        os.path.join(extensions_dir, 'ops', 'cpu', '*.cpp'))
+    source_cpu = (
+        glob.glob(os.path.join(extensions_dir, 'ops', 'autograd', '*.cpp')) +
+        glob.glob(os.path.join(extensions_dir, 'ops', 'cpu', '*.cpp')) +
+        glob.glob(os.path.join(extensions_dir, 'ops', 'quantized', 'cpu', '*.cpp'))
+    )
 
     is_rocm_pytorch = False
     if torch.__version__ >= '1.5':
...
@@ -418,6 +418,29 @@ class NMSTester(unittest.TestCase):
        self.assertRaises(RuntimeError, ops.nms, torch.rand(3, 4), torch.rand(3, 2), 0.5)
        self.assertRaises(RuntimeError, ops.nms, torch.rand(3, 4), torch.rand(4), 0.5)

    def test_qnms(self):
        # Note: we compare qnms vs nms instead of qnms vs a reference implementation.
        # This is because with the int conversion, the trick used in _create_tensors_with_iou
        # doesn't really work (in fact, nms vs the reference implementation will also fail with ints)
        err_msg = 'NMS and QNMS give different results for IoU={}'
        for iou in [0.2, 0.5, 0.8]:
            for scale, zero_point in ((1, 0), (2, 50), (3, 10)):
                boxes, scores = self._create_tensors_with_iou(1000, iou)
                scores *= 100  # otherwise most scores would be 0 or 1 after int conversion

                qboxes = torch.quantize_per_tensor(boxes, scale=scale, zero_point=zero_point,
                                                   dtype=torch.quint8)
                qscores = torch.quantize_per_tensor(scores, scale=scale, zero_point=zero_point,
                                                    dtype=torch.quint8)

                boxes = qboxes.dequantize()
                scores = qscores.dequantize()

                keep = ops.nms(boxes, scores, iou)
                qkeep = ops.nms(qboxes, qscores, iou)

                self.assertTrue(torch.allclose(qkeep, keep), err_msg.format(iou))

    @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable")
    def test_nms_cuda(self, dtype=torch.float64):
        tol = 1e-3 if dtype is torch.half else 1e-5
...
#include <ATen/ATen.h>
#include <ATen/native/quantized/affine_quantizer.h>
#include <torch/library.h>

namespace vision {
namespace ops {

namespace {
template <typename scalar_t>
at::Tensor qnms_kernel_impl(
    const at::Tensor& dets,
    const at::Tensor& scores,
    double iou_threshold) {
  TORCH_CHECK(!dets.is_cuda(), "dets must be a CPU tensor");
  TORCH_CHECK(!scores.is_cuda(), "scores must be a CPU tensor");
  TORCH_CHECK(
      dets.scalar_type() == scores.scalar_type(),
      "dets should have the same type as scores");

  if (dets.numel() == 0)
    return at::empty({0}, dets.options().dtype(at::kLong));

  const auto ndets = dets.size(0);

  auto x1_t = dets.select(1, 0).contiguous();
  auto y1_t = dets.select(1, 1).contiguous();
  auto x2_t = dets.select(1, 2).contiguous();
  auto y2_t = dets.select(1, 3).contiguous();
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));

  at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
  at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
  at::Tensor areas_t = at::zeros({ndets}, dets.options().dtype(at::kFloat));

  auto suppressed = suppressed_t.data_ptr<uint8_t>();
  auto keep = keep_t.data_ptr<int64_t>();
  auto order = order_t.data_ptr<int64_t>();
  // x1, y1, x2, y2 point to the quantized coordinates; .val_ below reads the
  // underlying raw integer representation of each value.
  auto x1 = x1_t.data_ptr<scalar_t>();
  auto y1 = y1_t.data_ptr<scalar_t>();
  auto x2 = x2_t.data_ptr<scalar_t>();
  auto y2 = y2_t.data_ptr<scalar_t>();
  auto areas = areas_t.data_ptr<float>();

  for (int64_t i = 0; i < ndets; i++) {
    // Note 1: To get the exact area we'd need to multiply by scale**2, but
    // this would get canceled out in the computation of ovr below. So we
    // leave that out.
    // Note 2: degenerate boxes (x2 < x1 or y2 < y1) may underflow, although
    // integral promotion rules will likely prevent it (see
    // https://stackoverflow.com/questions/32959564/subtraction-of-two-unsigned-gives-signed
    // for more details).
    areas[i] = (x2[i].val_ - x1[i].val_) * (y2[i].val_ - y1[i].val_);
  }
  int64_t num_to_keep = 0;

  // Greedy NMS: walk the boxes in decreasing score order and suppress any
  // remaining box whose IoU with the current box exceeds the threshold.
  for (int64_t _i = 0; _i < ndets; _i++) {
    auto i = order[_i];
    if (suppressed[i] == 1)
      continue;
    keep[num_to_keep++] = i;

    // We explicitly cast coordinates to float so that the code can be
    // vectorized.
    float ix1val = x1[i].val_;
    float iy1val = y1[i].val_;
    float ix2val = x2[i].val_;
    float iy2val = y2[i].val_;
    float iarea = areas[i];

    for (int64_t _j = _i + 1; _j < ndets; _j++) {
      auto j = order[_j];
      if (suppressed[j] == 1)
        continue;
      float xx1 = std::max(ix1val, (float)x1[j].val_);
      float yy1 = std::max(iy1val, (float)y1[j].val_);
      float xx2 = std::min(ix2val, (float)x2[j].val_);
      float yy2 = std::min(iy2val, (float)y2[j].val_);

      auto w = std::max(0.f, xx2 - xx1); // * scale (gets canceled below)
      auto h = std::max(0.f, yy2 - yy1); // * scale (gets canceled below)
      auto inter = w * h;
      auto ovr = inter / (iarea + areas[j] - inter);
      if (ovr > iou_threshold)
        suppressed[j] = 1;
    }
  }
  return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}
at::Tensor qnms_kernel(
    const at::Tensor& dets,
    const at::Tensor& scores,
    double iou_threshold) {
  TORCH_CHECK(
      dets.dim() == 2, "boxes should be a 2d tensor, got ", dets.dim(), "D");
  TORCH_CHECK(
      dets.size(1) == 4,
      "boxes should have 4 elements in dimension 1, got ",
      dets.size(1));
  TORCH_CHECK(
      scores.dim() == 1,
      "scores should be a 1d tensor, got ",
      scores.dim(),
      "D");
  TORCH_CHECK(
      dets.size(0) == scores.size(0),
      "boxes and scores should have same number of elements in ",
      "dimension 0, got ",
      dets.size(0),
      " and ",
      scores.size(0));

  auto result = at::empty({0});

  AT_DISPATCH_QINT_TYPES(dets.scalar_type(), "qnms_kernel", [&] {
    result = qnms_kernel_impl<scalar_t>(dets, scores, iou_threshold);
  });
  return result;
}
} // namespace

TORCH_LIBRARY_IMPL(torchvision, QuantizedCPU, m) {
  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(qnms_kernel));
}

} // namespace ops
} // namespace vision
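For reference, a minimal usage sketch (not part of the commit; the box coordinates, scores and quantization parameters below are made up): once the QuantizedCPU kernel above is registered, passing quantized boxes and scores to torchvision.ops.nms dispatches to it, which is exactly what the new test exercises.

import torch
from torchvision import ops

# Made-up boxes in (x1, y1, x2, y2) format and matching scores.
boxes = torch.tensor([[0., 0., 100., 100.],
                      [10., 10., 110., 110.],
                      [200., 200., 250., 250.]])
scores = torch.tensor([0.9, 0.8, 0.7])

# Quantize to quint8; calling ops.nms on quantized inputs dispatches to the
# QuantizedCPU kernel registered above.
qboxes = torch.quantize_per_tensor(boxes, scale=1.0, zero_point=0, dtype=torch.quint8)
qscores = torch.quantize_per_tensor(scores, scale=0.01, zero_point=0, dtype=torch.quint8)

keep = ops.nms(qboxes, qscores, iou_threshold=0.5)  # indices of the kept boxes
print(keep)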