// Copyright (c) Facebook, Inc. and its affiliates.
#pragma once
#include <torch/types.h>
namespace detectron2 {
at::Tensor nms_rotated_cpu(
const at::Tensor& dets,
const at::Tensor& scores,
const double iou_threshold);
#if defined(WITH_CUDA) || defined(WITH_HIP)
at::Tensor nms_rotated_cuda(
const at::Tensor& dets,
const at::Tensor& scores,
const double iou_threshold);
#endif
// Interface for Python
// inline is needed to prevent multiple function definitions when this header is
// included by multiple .cpp files
inline at::Tensor nms_rotated(
const at::Tensor& dets,
const at::Tensor& scores,
const double iou_threshold) {
assert(dets.device().is_cuda() == scores.device().is_cuda());
if (dets.device().is_cuda()) {
#if defined(WITH_CUDA) || defined(WITH_HIP)
return nms_rotated_cuda(
dets.contiguous(), scores.contiguous(), iou_threshold);
#else
AT_ERROR("Detectron2 is not compiled with GPU support!");
#endif
}
return nms_rotated_cpu(dets.contiguous(), scores.contiguous(), iou_threshold);
}
} // namespace detectron2
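// Illustrative usage sketch (not part of the original header): with rotated
// boxes in (x_ctr, y_ctr, w, h, angle_degrees) format,
//
//   at::Tensor dets = ...;   // (N, 5) float tensor of rotated boxes
//   at::Tensor scores = ...; // (N,) float tensor of scores
//   at::Tensor keep = detectron2::nms_rotated(dets, scores, /*iou_threshold=*/0.5);
//
// dispatches to nms_rotated_cuda when the inputs live on the GPU and to
// nms_rotated_cpu otherwise; keep holds the indices of the retained boxes.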
// Copyright (c) Facebook, Inc. and its affiliates.
#include "../box_iou_rotated/box_iou_rotated_utils.h"
#include "nms_rotated.h"
namespace detectron2 {
template <typename scalar_t>
at::Tensor nms_rotated_cpu_kernel(
const at::Tensor& dets,
const at::Tensor& scores,
const double iou_threshold) {
// nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel,
// however, the code in this function is much shorter because
// we delegate the IoU computation for rotated boxes to
// the single_box_iou_rotated function in box_iou_rotated_utils.h
AT_ASSERTM(dets.device().is_cpu(), "dets must be a CPU tensor");
AT_ASSERTM(scores.device().is_cpu(), "scores must be a CPU tensor");
AT_ASSERTM(
dets.scalar_type() == scores.scalar_type(),
"dets should have the same type as scores");
if (dets.numel() == 0) {
return at::empty({0}, dets.options().dtype(at::kLong));
}
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto ndets = dets.size(0);
at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte));
at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong));
auto suppressed = suppressed_t.data_ptr<uint8_t>();
auto keep = keep_t.data_ptr<int64_t>();
auto order = order_t.data_ptr<int64_t>();
int64_t num_to_keep = 0;
for (int64_t _i = 0; _i < ndets; _i++) {
auto i = order[_i];
if (suppressed[i] == 1) {
continue;
}
keep[num_to_keep++] = i;
for (int64_t _j = _i + 1; _j < ndets; _j++) {
auto j = order[_j];
if (suppressed[j] == 1) {
continue;
}
auto ovr = single_box_iou_rotated<scalar_t>(
dets[i].data_ptr<scalar_t>(), dets[j].data_ptr<scalar_t>());
if (ovr >= iou_threshold) {
suppressed[j] = 1;
}
}
}
return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep);
}
at::Tensor nms_rotated_cpu(
// input must be contiguous
const at::Tensor& dets,
const at::Tensor& scores,
const double iou_threshold) {
auto result = at::empty({0}, dets.options());
AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] {
result = nms_rotated_cpu_kernel<scalar_t>(dets, scores, iou_threshold);
});
return result;
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#ifdef WITH_CUDA
#include "../box_iou_rotated/box_iou_rotated_utils.h"
#endif
// TODO avoid this when pytorch supports "same directory" hipification
#ifdef WITH_HIP
#include "box_iou_rotated/box_iou_rotated_utils.h"
#endif
using namespace detectron2;
namespace {
int const threadsPerBlock = sizeof(unsigned long long) * 8;
}
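// Explanatory note: threadsPerBlock equals the number of bits in an
// unsigned long long (64), so each kernel block compares up to 64 "row" boxes
// against up to 64 "column" boxes, and the suppression decisions of one row
// box against a whole column tile fit in a single 64-bit word of dev_mask.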
template <typename T>
__global__ void nms_rotated_cuda_kernel(
const int n_boxes,
const double iou_threshold,
const T* dev_boxes,
unsigned long long* dev_mask) {
// nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel
const int row_start = blockIdx.y;
const int col_start = blockIdx.x;
// if (row_start > col_start) return;
const int row_size =
min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
const int col_size =
min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
// Compared to nms_cuda_kernel, where each box is represented with 4 values
// (x1, y1, x2, y2), each rotated box is represented with 5 values
// (x_center, y_center, width, height, angle_degrees) here.
__shared__ T block_boxes[threadsPerBlock * 5];
if (threadIdx.x < col_size) {
block_boxes[threadIdx.x * 5 + 0] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
block_boxes[threadIdx.x * 5 + 1] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
block_boxes[threadIdx.x * 5 + 2] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
block_boxes[threadIdx.x * 5 + 3] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
block_boxes[threadIdx.x * 5 + 4] =
dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
}
__syncthreads();
if (threadIdx.x < row_size) {
const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
const T* cur_box = dev_boxes + cur_box_idx * 5;
int i = 0;
unsigned long long t = 0;
int start = 0;
if (row_start == col_start) {
start = threadIdx.x + 1;
}
for (i = start; i < col_size; i++) {
// Instead of the devIoU function used by the original horizontal nms, here
// we use the single_box_iou_rotated function from box_iou_rotated_utils.h
if (single_box_iou_rotated<T>(cur_box, block_boxes + i * 5) >
iou_threshold) {
t |= 1ULL << i;
}
}
const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock);
dev_mask[cur_box_idx * col_blocks + col_start] = t;
}
}
namespace detectron2 {
at::Tensor nms_rotated_cuda(
// input must be contiguous
const at::Tensor& dets,
const at::Tensor& scores,
double iou_threshold) {
// using scalar_t = float;
AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor");
AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor");
at::cuda::CUDAGuard device_guard(dets.device());
auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
auto dets_sorted = dets.index_select(0, order_t);
auto dets_num = dets.size(0);
const int col_blocks =
at::cuda::ATenCeilDiv(static_cast<int>(dets_num), threadsPerBlock);
at::Tensor mask =
at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong));
dim3 blocks(col_blocks, col_blocks);
dim3 threads(threadsPerBlock);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_DISPATCH_FLOATING_TYPES(
dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] {
nms_rotated_cuda_kernel<scalar_t><<<blocks, threads, 0, stream>>>(
dets_num,
iou_threshold,
dets_sorted.data_ptr<scalar_t>(),
(unsigned long long*)mask.data_ptr<int64_t>());
});
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long* mask_host =
(unsigned long long*)mask_cpu.data_ptr<int64_t>();
std::vector<unsigned long long> remv(col_blocks);
memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
at::Tensor keep =
at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU));
int64_t* keep_out = keep.data_ptr<int64_t>();
int num_to_keep = 0;
for (int i = 0; i < dets_num; i++) {
int nblock = i / threadsPerBlock;
int inblock = i % threadsPerBlock;
if (!(remv[nblock] & (1ULL << inblock))) {
keep_out[num_to_keep++] = i;
unsigned long long* p = mask_host + i * col_blocks;
for (int j = nblock; j < col_blocks; j++) {
remv[j] |= p[j];
}
}
}
AT_CUDA_CHECK(cudaGetLastError());
return order_t.index(
{keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep)
.to(order_t.device(), keep.scalar_type())});
}
} // namespace detectron2
// Copyright (c) Facebook, Inc. and its affiliates.
#include <torch/extension.h>
#include "ROIAlignRotated/ROIAlignRotated.h"
#include "box_iou_rotated/box_iou_rotated.h"
#include "cocoeval/cocoeval.h"
#include "deformable/deform_conv.h"
#include "nms_rotated/nms_rotated.h"
namespace detectron2 {
#if defined(WITH_CUDA) || defined(WITH_HIP)
extern int get_cudart_version();
#endif
std::string get_cuda_version() {
#if defined(WITH_CUDA) || defined(WITH_HIP)
std::ostringstream oss;
#if defined(WITH_CUDA)
oss << "CUDA ";
#else
oss << "HIP ";
#endif
// copied from
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231
auto printCudaStyleVersion = [&](int v) {
oss << (v / 1000) << "." << (v / 10 % 100);
if (v % 10 != 0) {
oss << "." << (v % 10);
}
};
printCudaStyleVersion(get_cudart_version());
return oss.str();
#else // neither CUDA nor HIP
return std::string("not available");
#endif
}
bool has_cuda() {
#if defined(WITH_CUDA)
return true;
#else
return false;
#endif
}
// similar to
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp
std::string get_compiler_version() {
std::ostringstream ss;
#if defined(__GNUC__)
#ifndef __clang__
#if ((__GNUC__ <= 4) && (__GNUC_MINOR__ <= 8))
#error "GCC >= 4.9 is required!"
#endif
{ ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; }
#endif
#endif
#if defined(__clang_major__)
{
ss << "clang " << __clang_major__ << "." << __clang_minor__ << "."
<< __clang_patchlevel__;
}
#endif
#if defined(_MSC_VER)
{ ss << "MSVC " << _MSC_FULL_VER; }
#endif
return ss.str();
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("get_compiler_version", &get_compiler_version, "get_compiler_version");
m.def("get_cuda_version", &get_cuda_version, "get_cuda_version");
m.def("has_cuda", &has_cuda, "has_cuda");
m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward");
m.def(
"deform_conv_backward_input",
&deform_conv_backward_input,
"deform_conv_backward_input");
m.def(
"deform_conv_backward_filter",
&deform_conv_backward_filter,
"deform_conv_backward_filter");
m.def(
"modulated_deform_conv_forward",
&modulated_deform_conv_forward,
"modulated_deform_conv_forward");
m.def(
"modulated_deform_conv_backward",
&modulated_deform_conv_backward,
"modulated_deform_conv_backward");
m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate");
m.def(
"COCOevalEvaluateImages",
&COCOeval::EvaluateImages,
"COCOeval::EvaluateImages");
pybind11::class_<COCOeval::InstanceAnnotation>(m, "InstanceAnnotation")
.def(pybind11::init<uint64_t, double, double, bool, bool>());
pybind11::class_<COCOeval::ImageEvaluation>(m, "ImageEvaluation")
.def(pybind11::init<>());
}
TORCH_LIBRARY(detectron2, m) {
m.def("nms_rotated", &nms_rotated);
m.def("box_iou_rotated", &box_iou_rotated);
m.def("roi_align_rotated_forward", &ROIAlignRotated_forward);
m.def("roi_align_rotated_backward", &ROIAlignRotated_backward);
}
} // namespace detectron2
# Copyright (c) Facebook, Inc. and its affiliates.
import math
from functools import lru_cache
import torch
from torch import nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair
from torchvision.ops import deform_conv2d
from detectron2.utils.develop import create_dummy_class, create_dummy_func
from .wrappers import _NewEmptyTensorOp
class _DeformConv(Function):
@staticmethod
def forward(
ctx,
input,
offset,
weight,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
im2col_step=64,
):
if input is not None and input.dim() != 4:
raise ValueError(
"Expected 4D tensor as input, got {}D tensor instead.".format(input.dim())
)
ctx.stride = _pair(stride)
ctx.padding = _pair(padding)
ctx.dilation = _pair(dilation)
ctx.groups = groups
ctx.deformable_groups = deformable_groups
ctx.im2col_step = im2col_step
ctx.save_for_backward(input, offset, weight)
output = input.new_empty(
_DeformConv._output_size(input, weight, ctx.padding, ctx.dilation, ctx.stride)
)
ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones
if not input.is_cuda:
# TODO: let torchvision support full features of our deformconv.
if deformable_groups != 1:
raise NotImplementedError(
"Deformable Conv with deformable_groups != 1 is not supported on CPUs!"
)
return deform_conv2d(
input, offset, weight, stride=stride, padding=padding, dilation=dilation
)
else:
cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"
_C.deform_conv_forward(
input,
weight,
offset,
output,
ctx.bufs_[0],
ctx.bufs_[1],
weight.size(3),
weight.size(2),
ctx.stride[1],
ctx.stride[0],
ctx.padding[1],
ctx.padding[0],
ctx.dilation[1],
ctx.dilation[0],
ctx.groups,
ctx.deformable_groups,
cur_im2col_step,
)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
input, offset, weight = ctx.saved_tensors
grad_input = grad_offset = grad_weight = None
if not grad_output.is_cuda:
raise NotImplementedError("Deformable Conv is not supported on CPUs!")
else:
cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step)
assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize"
if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
grad_input = torch.zeros_like(input)
grad_offset = torch.zeros_like(offset)
_C.deform_conv_backward_input(
input,
offset,
grad_output,
grad_input,
grad_offset,
weight,
ctx.bufs_[0],
weight.size(3),
weight.size(2),
ctx.stride[1],
ctx.stride[0],
ctx.padding[1],
ctx.padding[0],
ctx.dilation[1],
ctx.dilation[0],
ctx.groups,
ctx.deformable_groups,
cur_im2col_step,
)
if ctx.needs_input_grad[2]:
grad_weight = torch.zeros_like(weight)
_C.deform_conv_backward_filter(
input,
offset,
grad_output,
grad_weight,
ctx.bufs_[0],
ctx.bufs_[1],
weight.size(3),
weight.size(2),
ctx.stride[1],
ctx.stride[0],
ctx.padding[1],
ctx.padding[0],
ctx.dilation[1],
ctx.dilation[0],
ctx.groups,
ctx.deformable_groups,
1,
cur_im2col_step,
)
return grad_input, grad_offset, grad_weight, None, None, None, None, None, None
@staticmethod
def _output_size(input, weight, padding, dilation, stride):
channels = weight.size(0)
output_size = (input.size(0), channels)
for d in range(input.dim() - 2):
in_size = input.size(d + 2)
pad = padding[d]
kernel = dilation[d] * (weight.size(d + 2) - 1) + 1
stride_ = stride[d]
output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1,)
if not all(map(lambda s: s > 0, output_size)):
raise ValueError(
"convolution input is too small (output would be {})".format(
"x".join(map(str, output_size))
)
)
return output_size
@staticmethod
@lru_cache(maxsize=128)
def _cal_im2col_step(input_size, default_size):
"""
Calculate a proper im2col step size, which should divide input_size and be no larger
than default_size. Meanwhile the step size should be as large as possible for
efficiency, so we choose the largest divisor of input_size that does not exceed
default_size.
:param input_size: input batch size.
:param default_size: default preferred im2col step size.
:return: the largest proper step size.
"""
if input_size <= default_size:
return input_size
best_step = 1
for step in range(2, min(int(math.sqrt(input_size)) + 1, default_size)):
if input_size % step == 0:
if input_size // step <= default_size:
return input_size // step
best_step = step
return best_step
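# Illustrative values (not part of the original file):
#   _cal_im2col_step(32, 64)  -> 32  (batch already fits in the preferred step)
#   _cal_im2col_step(100, 64) -> 50  (largest divisor of 100 not exceeding 64)
#   _cal_im2col_step(97, 64)  -> 1   (97 is prime, so 1 is the only usable step)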
class _ModulatedDeformConv(Function):
@staticmethod
def forward(
ctx,
input,
offset,
mask,
weight,
bias=None,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
):
ctx.stride = stride
ctx.padding = padding
ctx.dilation = dilation
ctx.groups = groups
ctx.deformable_groups = deformable_groups
ctx.with_bias = bias is not None
if not ctx.with_bias:
bias = input.new_empty(1) # fake tensor
if not input.is_cuda:
raise NotImplementedError("Deformable Conv is not supported on CPUs!")
if (
weight.requires_grad
or mask.requires_grad
or offset.requires_grad
or input.requires_grad
):
ctx.save_for_backward(input, offset, mask, weight, bias)
output = input.new_empty(_ModulatedDeformConv._infer_shape(ctx, input, weight))
ctx._bufs = [input.new_empty(0), input.new_empty(0)]
_C.modulated_deform_conv_forward(
input,
weight,
bias,
ctx._bufs[0],
offset,
mask,
output,
ctx._bufs[1],
weight.shape[2],
weight.shape[3],
ctx.stride,
ctx.stride,
ctx.padding,
ctx.padding,
ctx.dilation,
ctx.dilation,
ctx.groups,
ctx.deformable_groups,
ctx.with_bias,
)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
if not grad_output.is_cuda:
raise NotImplementedError("Deformable Conv is not supported on CPUs!")
input, offset, mask, weight, bias = ctx.saved_tensors
grad_input = torch.zeros_like(input)
grad_offset = torch.zeros_like(offset)
grad_mask = torch.zeros_like(mask)
grad_weight = torch.zeros_like(weight)
grad_bias = torch.zeros_like(bias)
_C.modulated_deform_conv_backward(
input,
weight,
bias,
ctx._bufs[0],
offset,
mask,
ctx._bufs[1],
grad_input,
grad_weight,
grad_bias,
grad_offset,
grad_mask,
grad_output,
weight.shape[2],
weight.shape[3],
ctx.stride,
ctx.stride,
ctx.padding,
ctx.padding,
ctx.dilation,
ctx.dilation,
ctx.groups,
ctx.deformable_groups,
ctx.with_bias,
)
if not ctx.with_bias:
grad_bias = None
return (
grad_input,
grad_offset,
grad_mask,
grad_weight,
grad_bias,
None,
None,
None,
None,
None,
)
@staticmethod
def _infer_shape(ctx, input, weight):
n = input.size(0)
channels_out = weight.size(0)
height, width = input.shape[2:4]
kernel_h, kernel_w = weight.shape[2:4]
height_out = (
height + 2 * ctx.padding - (ctx.dilation * (kernel_h - 1) + 1)
) // ctx.stride + 1
width_out = (
width + 2 * ctx.padding - (ctx.dilation * (kernel_w - 1) + 1)
) // ctx.stride + 1
return n, channels_out, height_out, width_out
deform_conv = _DeformConv.apply
modulated_deform_conv = _ModulatedDeformConv.apply
class DeformConv(nn.Module):
def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
bias=False,
norm=None,
activation=None,
):
"""
Deformable convolution from :paper:`deformconv`.
Arguments are similar to :class:`Conv2D`. Extra arguments:
Args:
deformable_groups (int): number of groups used in deformable convolution.
norm (nn.Module, optional): a normalization layer
activation (callable(Tensor) -> Tensor): a callable activation function
"""
super(DeformConv, self).__init__()
assert not bias
assert in_channels % groups == 0, "in_channels {} is not divisible by groups {}".format(
in_channels, groups
)
assert (
out_channels % groups == 0
), "out_channels {} is not divisible by groups {}".format(out_channels, groups)
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = _pair(kernel_size)
self.stride = _pair(stride)
self.padding = _pair(padding)
self.dilation = _pair(dilation)
self.groups = groups
self.deformable_groups = deformable_groups
self.norm = norm
self.activation = activation
self.weight = nn.Parameter(
torch.Tensor(out_channels, in_channels // self.groups, *self.kernel_size)
)
self.bias = None
nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
def forward(self, x, offset):
if x.numel() == 0:
# When input is empty, we want to return an empty tensor with "correct" shape,
# so that the following operations will not panic
# if they check the shape of the tensor.
# This computes the height and width of the output tensor
output_shape = [
(i + 2 * p - (di * (k - 1) + 1)) // s + 1
for i, p, di, k, s in zip(
x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
)
]
output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
return _NewEmptyTensorOp.apply(x, output_shape)
x = deform_conv(
x,
offset,
self.weight,
self.stride,
self.padding,
self.dilation,
self.groups,
self.deformable_groups,
)
if self.norm is not None:
x = self.norm(x)
if self.activation is not None:
x = self.activation(x)
return x
def extra_repr(self):
tmpstr = "in_channels=" + str(self.in_channels)
tmpstr += ", out_channels=" + str(self.out_channels)
tmpstr += ", kernel_size=" + str(self.kernel_size)
tmpstr += ", stride=" + str(self.stride)
tmpstr += ", padding=" + str(self.padding)
tmpstr += ", dilation=" + str(self.dilation)
tmpstr += ", groups=" + str(self.groups)
tmpstr += ", deformable_groups=" + str(self.deformable_groups)
tmpstr += ", bias=False"
return tmpstr
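# Illustrative usage sketch (assumes a CUDA build of detectron2._C); the offset
# tensor carries 2 * deformable_groups * kH * kW channels, i.e. one 2-D offset
# per kernel tap per deformable group:
#
#   conv = DeformConv(16, 32, kernel_size=3, padding=1).cuda()
#   x = torch.randn(2, 16, 64, 64, device="cuda")
#   offset = torch.zeros(2, 18, 64, 64, device="cuda")  # 2 * 1 * 3 * 3; zeros = regular grid sampling
#   y = conv(x, offset)  # (2, 32, 64, 64)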
class ModulatedDeformConv(nn.Module):
def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
deformable_groups=1,
bias=True,
norm=None,
activation=None,
):
"""
Modulated deformable convolution from :paper:`deformconv2`.
Arguments are similar to :class:`Conv2D`. Extra arguments:
Args:
deformable_groups (int): number of groups used in deformable convolution.
norm (nn.Module, optional): a normalization layer
activation (callable(Tensor) -> Tensor): a callable activation function
"""
super(ModulatedDeformConv, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.kernel_size = _pair(kernel_size)
self.stride = stride
self.padding = padding
self.dilation = dilation
self.groups = groups
self.deformable_groups = deformable_groups
self.with_bias = bias
self.norm = norm
self.activation = activation
self.weight = nn.Parameter(
torch.Tensor(out_channels, in_channels // groups, *self.kernel_size)
)
if bias:
self.bias = nn.Parameter(torch.Tensor(out_channels))
else:
self.bias = None
nn.init.kaiming_uniform_(self.weight, nonlinearity="relu")
if self.bias is not None:
nn.init.constant_(self.bias, 0)
def forward(self, x, offset, mask):
if x.numel() == 0:
output_shape = [
(i + 2 * p - (di * (k - 1) + 1)) // s + 1
for i, p, di, k, s in zip(
x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride
)
]
output_shape = [x.shape[0], self.weight.shape[0]] + output_shape
return _NewEmptyTensorOp.apply(x, output_shape)
x = modulated_deform_conv(
x,
offset,
mask,
self.weight,
self.bias,
self.stride,
self.padding,
self.dilation,
self.groups,
self.deformable_groups,
)
if self.norm is not None:
x = self.norm(x)
if self.activation is not None:
x = self.activation(x)
return x
def extra_repr(self):
tmpstr = "in_channels=" + str(self.in_channels)
tmpstr += ", out_channels=" + str(self.out_channels)
tmpstr += ", kernel_size=" + str(self.kernel_size)
tmpstr += ", stride=" + str(self.stride)
tmpstr += ", padding=" + str(self.padding)
tmpstr += ", dilation=" + str(self.dilation)
tmpstr += ", groups=" + str(self.groups)
tmpstr += ", deformable_groups=" + str(self.deformable_groups)
tmpstr += ", bias=" + str(self.with_bias)
return tmpstr
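# Illustrative usage sketch (CUDA only); besides the offsets, the mask tensor
# carries deformable_groups * kH * kW modulation scalars per spatial location:
#
#   conv = ModulatedDeformConv(16, 32, kernel_size=3, padding=1).cuda()
#   x = torch.randn(2, 16, 64, 64, device="cuda")
#   offset = torch.zeros(2, 18, 64, 64, device="cuda")  # 2 * 1 * 3 * 3
#   mask = torch.ones(2, 9, 64, 64, device="cuda")      # 1 * 3 * 3; all-ones = no modulation
#   y = conv(x, offset, mask)  # (2, 32, 64, 64)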
try:
from detectron2 import _C
except ImportError:
# TODO: register ops natively so there is no need to import _C.
_msg = "detectron2 is not compiled successfully, please build following the instructions!"
_args = ("detectron2._C", _msg)
DeformConv = create_dummy_class("DeformConv", *_args)
ModulatedDeformConv = create_dummy_class("ModulatedDeformConv", *_args)
deform_conv = create_dummy_func("deform_conv", *_args)
modulated_deform_conv = create_dummy_func("modulated_deform_conv", *_args)
import math
import torch
def diou_loss(
boxes1: torch.Tensor,
boxes2: torch.Tensor,
reduction: str = "none",
eps: float = 1e-7,
) -> torch.Tensor:
"""
Distance Intersection over Union Loss (Zhaohui Zheng et al.)
https://arxiv.org/abs/1911.08287
Args:
boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,).
reduction: 'none' | 'mean' | 'sum'
'none': No reduction will be applied to the output.
'mean': The output will be averaged.
'sum': The output will be summed.
eps (float): small number to prevent division by zero
"""
x1, y1, x2, y2 = boxes1.unbind(dim=-1)
x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
# TODO: use torch._assert_async() when pytorch 1.8 support is dropped
assert (x2 >= x1).all(), "bad box: x1 larger than x2"
assert (y2 >= y1).all(), "bad box: y1 larger than y2"
# Intersection keypoints
xkis1 = torch.max(x1, x1g)
ykis1 = torch.max(y1, y1g)
xkis2 = torch.min(x2, x2g)
ykis2 = torch.min(y2, y2g)
intsct = torch.zeros_like(x1)
mask = (ykis2 > ykis1) & (xkis2 > xkis1)
intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps
iou = intsct / union
# smallest enclosing box
xc1 = torch.min(x1, x1g)
yc1 = torch.min(y1, y1g)
xc2 = torch.max(x2, x2g)
yc2 = torch.max(y2, y2g)
diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps
# centers of boxes
x_p = (x2 + x1) / 2
y_p = (y2 + y1) / 2
x_g = (x1g + x2g) / 2
y_g = (y1g + y2g) / 2
distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2)
# Eqn. (7)
loss = 1 - iou + (distance / diag_len)
if reduction == "mean":
loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
elif reduction == "sum":
loss = loss.sum()
return loss
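# Illustrative example (not part of the original file): identical boxes give a
# loss of ~0, while shifted boxes are penalized by both the IoU term and the
# normalized center-distance term:
#
#   b1 = torch.tensor([[0.0, 0.0, 2.0, 2.0]])
#   b2 = torch.tensor([[1.0, 1.0, 3.0, 3.0]])
#   diou_loss(b1, b1)  # ~0.0
#   diou_loss(b1, b2)  # ~0.968: 1 - 1/7 (IoU) + 2/18 (distance / diag_len)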
def ciou_loss(
boxes1: torch.Tensor,
boxes2: torch.Tensor,
reduction: str = "none",
eps: float = 1e-7,
) -> torch.Tensor:
"""
Complete Intersection over Union Loss (Zhaohui Zheng et al.)
https://arxiv.org/abs/1911.08287
Args:
boxes1, boxes2 (Tensor): box locations in XYXY format, shape (N, 4) or (4,).
reduction: 'none' | 'mean' | 'sum'
'none': No reduction will be applied to the output.
'mean': The output will be averaged.
'sum': The output will be summed.
eps (float): small number to prevent division by zero
"""
x1, y1, x2, y2 = boxes1.unbind(dim=-1)
x1g, y1g, x2g, y2g = boxes2.unbind(dim=-1)
# TODO: use torch._assert_async() when pytorch 1.8 support is dropped
assert (x2 >= x1).all(), "bad box: x1 larger than x2"
assert (y2 >= y1).all(), "bad box: y1 larger than y2"
# Intersection keypoints
xkis1 = torch.max(x1, x1g)
ykis1 = torch.max(y1, y1g)
xkis2 = torch.min(x2, x2g)
ykis2 = torch.min(y2, y2g)
intsct = torch.zeros_like(x1)
mask = (ykis2 > ykis1) & (xkis2 > xkis1)
intsct[mask] = (xkis2[mask] - xkis1[mask]) * (ykis2[mask] - ykis1[mask])
union = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g) - intsct + eps
iou = intsct / union
# smallest enclosing box
xc1 = torch.min(x1, x1g)
yc1 = torch.min(y1, y1g)
xc2 = torch.max(x2, x2g)
yc2 = torch.max(y2, y2g)
diag_len = ((xc2 - xc1) ** 2) + ((yc2 - yc1) ** 2) + eps
# centers of boxes
x_p = (x2 + x1) / 2
y_p = (y2 + y1) / 2
x_g = (x1g + x2g) / 2
y_g = (y1g + y2g) / 2
distance = ((x_p - x_g) ** 2) + ((y_p - y_g) ** 2)
# width and height of boxes
w_pred = x2 - x1
h_pred = y2 - y1
w_gt = x2g - x1g
h_gt = y2g - y1g
v = (4 / (math.pi**2)) * torch.pow((torch.atan(w_gt / h_gt) - torch.atan(w_pred / h_pred)), 2)
with torch.no_grad():
alpha = v / (1 - iou + v + eps)
# Eqn. (10)
loss = 1 - iou + (distance / diag_len) + alpha * v
if reduction == "mean":
loss = loss.mean() if loss.numel() > 0 else 0.0 * loss.sum()
elif reduction == "sum":
loss = loss.sum()
return loss
# Copyright (c) Facebook, Inc. and its affiliates.
import numpy as np
from typing import Tuple
import torch
from PIL import Image
from torch.nn import functional as F
__all__ = ["paste_masks_in_image"]
BYTES_PER_FLOAT = 4
# TODO: This memory limit may be too much or too little. It would be better to
# determine it based on available resources.
GPU_MEM_LIMIT = 1024**3 # 1 GB memory limit
def _do_paste_mask(masks, boxes, img_h: int, img_w: int, skip_empty: bool = True):
"""
Args:
masks: N, 1, H, W
boxes: N, 4
img_h, img_w (int):
skip_empty (bool): only paste masks within the region that
tightly bounds all boxes, and return the results for this region only.
An important optimization for CPU.
Returns:
if skip_empty == False, a mask of shape (N, img_h, img_w)
if skip_empty == True, a mask of shape (N, h', w'), and the slice
object for the corresponding region.
"""
# On GPU, paste all masks together (up to chunk size)
# by using the entire image to sample the masks
# Compared to pasting them one by one,
# this has more operations but is faster on COCO-scale datasets.
device = masks.device
if skip_empty and not torch.jit.is_scripting():
x0_int, y0_int = torch.clamp(boxes.min(dim=0).values.floor()[:2] - 1, min=0).to(
dtype=torch.int32
)
x1_int = torch.clamp(boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32)
y1_int = torch.clamp(boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32)
else:
x0_int, y0_int = 0, 0
x1_int, y1_int = img_w, img_h
x0, y0, x1, y1 = torch.split(boxes, 1, dim=1) # each is Nx1
N = masks.shape[0]
img_y = torch.arange(y0_int, y1_int, device=device, dtype=torch.float32) + 0.5
img_x = torch.arange(x0_int, x1_int, device=device, dtype=torch.float32) + 0.5
img_y = (img_y - y0) / (y1 - y0) * 2 - 1
img_x = (img_x - x0) / (x1 - x0) * 2 - 1
# img_x, img_y have shapes (N, w), (N, h)
gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1))
gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1))
grid = torch.stack([gx, gy], dim=3)
if not torch.jit.is_scripting():
if not masks.dtype.is_floating_point:
masks = masks.float()
img_masks = F.grid_sample(masks, grid.to(masks.dtype), align_corners=False)
if skip_empty and not torch.jit.is_scripting():
return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int))
else:
return img_masks[:, 0], ()
# Annotate boxes as Tensor (but not Boxes) in order to use scripting
@torch.jit.script_if_tracing
def paste_masks_in_image(
masks: torch.Tensor, boxes: torch.Tensor, image_shape: Tuple[int, int], threshold: float = 0.5
):
"""
Paste a set of masks that are of a fixed resolution (e.g., 28 x 28) into an image.
The location, height, and width for pasting each mask is determined by their
corresponding bounding boxes in boxes.
Note:
This is a complicated but more accurate implementation. In actual deployment, it is
often enough to use a faster but less accurate implementation.
See :func:`paste_mask_in_image_old` in this file for an alternative implementation.
Args:
masks (tensor): Tensor of shape (Bimg, Hmask, Wmask), where Bimg is the number of
detected object instances in the image and Hmask, Wmask are the mask height and mask
width of the predicted mask (e.g., Hmask = Wmask = 28). Values are in [0, 1].
boxes (Boxes or Tensor): A Boxes of length Bimg or Tensor of shape (Bimg, 4).
boxes[i] and masks[i] correspond to the same object instance.
image_shape (tuple): height, width
threshold (float): A threshold in [0, 1] for converting the (soft) masks to
binary masks.
Returns:
img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the
number of detected object instances and Himage, Wimage are the image height
and width. img_masks[i] is a binary mask for object instance i.
"""
assert masks.shape[-1] == masks.shape[-2], "Only square mask predictions are supported"
N = len(masks)
if N == 0:
return masks.new_empty((0,) + image_shape, dtype=torch.uint8)
if not isinstance(boxes, torch.Tensor):
boxes = boxes.tensor
device = boxes.device
assert len(boxes) == N, boxes.shape
img_h, img_w = image_shape
# The actual implementation splits the input into chunks,
# and pastes them chunk by chunk.
if device.type == "cpu" or torch.jit.is_scripting():
# CPU is most efficient when they are pasted one by one with skip_empty=True,
# so that it performs a minimal number of operations.
num_chunks = N
else:
# GPU benefits from parallelism for larger chunks, but may have memory issues
# int(img_h) because shapes may be tensors in tracing
num_chunks = int(np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT / GPU_MEM_LIMIT))
assert (
num_chunks <= N
), "Default GPU_MEM_LIMIT in mask_ops.py is too small; try increasing it"
chunks = torch.chunk(torch.arange(N, device=device), num_chunks)
img_masks = torch.zeros(
N, img_h, img_w, device=device, dtype=torch.bool if threshold >= 0 else torch.uint8
)
for inds in chunks:
masks_chunk, spatial_inds = _do_paste_mask(
masks[inds, None, :, :], boxes[inds], img_h, img_w, skip_empty=device.type == "cpu"
)
if threshold >= 0:
masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
else:
# for visualization and debugging
masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)
if torch.jit.is_scripting(): # Scripting does not use the optimized codepath
img_masks[inds] = masks_chunk
else:
img_masks[(inds,) + spatial_inds] = masks_chunk
return img_masks
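# Illustrative usage sketch (hypothetical shapes): paste three 28x28 soft masks
# into a 480x640 image and binarize at the default threshold of 0.5:
#
#   masks = torch.rand(3, 28, 28)
#   boxes = torch.tensor([[10.0, 20.0, 60.0, 90.0],
#                         [100.0, 50.0, 180.0, 200.0],
#                         [300.0, 300.0, 400.0, 380.0]])
#   img_masks = paste_masks_in_image(masks, boxes, (480, 640))
#   # img_masks: (3, 480, 640) torch.bool, one full-image mask per instance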
# The below are the original paste function (from Detectron1) which has
# larger quantization error.
# It is faster on CPU, while the aligned one is faster on GPU thanks to grid_sample.
def paste_mask_in_image_old(mask, box, img_h, img_w, threshold):
"""
Paste a single mask in an image.
This is a per-box implementation of :func:`paste_masks_in_image`.
This function has larger quantization error due to incorrect pixel
modeling and is not used any more.
Args:
mask (Tensor): A tensor of shape (Hmask, Wmask) storing the mask of a single
object instance. Values are in [0, 1].
box (Tensor): A tensor of shape (4, ) storing the x0, y0, x1, y1 box corners
of the object instance.
img_h, img_w (int): Image height and width.
threshold (float): Mask binarization threshold in [0, 1].
Returns:
im_mask (Tensor):
The resized and binarized object mask pasted into the original
image plane (a tensor of shape (img_h, img_w)).
"""
# Conversion from continuous box coordinates to discrete pixel coordinates
# via truncation (cast to int32). This determines which pixels to paste the
# mask onto.
box = box.to(dtype=torch.int32) # Continuous to discrete coordinate conversion
# An example (1D) box with continuous coordinates (x0=0.7, x1=4.3) will map to
# discrete coordinates (x0=0, x1=4). Note that box is mapped to 5 = x1 - x0 + 1
# pixels (not x1 - x0 pixels).
samples_w = box[2] - box[0] + 1 # Number of pixel samples, *not* geometric width
samples_h = box[3] - box[1] + 1 # Number of pixel samples, *not* geometric height
# Resample the mask from its original grid to the new samples_w x samples_h grid
mask = Image.fromarray(mask.cpu().numpy())
mask = mask.resize((samples_w, samples_h), resample=Image.BILINEAR)
mask = np.asarray(mask)
if threshold >= 0:
mask = np.array(mask > threshold, dtype=np.uint8)
mask = torch.from_numpy(mask)
else:
# for visualization and debugging, we also
# allow it to return an unmodified mask
mask = torch.from_numpy(mask * 255).to(torch.uint8)
im_mask = torch.zeros((img_h, img_w), dtype=torch.uint8)
x_0 = max(box[0], 0)
x_1 = min(box[2] + 1, img_w)
y_0 = max(box[1], 0)
y_1 = min(box[3] + 1, img_h)
im_mask[y_0:y_1, x_0:x_1] = mask[
(y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])
]
return im_mask
# Our pixel modeling requires extrapolation for any continuous
# coordinate < 0.5 or > length - 0.5. When sampling pixels on the masks,
# we would like this extrapolation to be an interpolation between boundary values and zero,
# instead of using absolute zero or boundary values.
# Therefore `paste_mask_in_image_old` is often used with zero padding around the masks like this:
# masks, scale = pad_masks(masks[:, 0, :, :], 1)
# boxes = scale_boxes(boxes.tensor, scale)
def pad_masks(masks, padding):
"""
Args:
masks (tensor): A tensor of shape (B, M, M) representing B masks.
padding (int): Number of cells to pad on all sides.
Returns:
The padded masks and the scale factor of the padding size / original size.
"""
B = masks.shape[0]
M = masks.shape[-1]
pad2 = 2 * padding
scale = float(M + pad2) / M
padded_masks = masks.new_zeros((B, M + pad2, M + pad2))
padded_masks[:, padding:-padding, padding:-padding] = masks
return padded_masks, scale
def scale_boxes(boxes, scale):
"""
Args:
boxes (tensor): A tensor of shape (B, 4) representing B boxes with 4
coords representing the corners x0, y0, x1, y1.
scale (float): The box scaling factor.
Returns:
Scaled boxes.
"""
w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5
h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5
x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5
y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5
w_half *= scale
h_half *= scale
scaled_boxes = torch.zeros_like(boxes)
scaled_boxes[:, 0] = x_c - w_half
scaled_boxes[:, 2] = x_c + w_half
scaled_boxes[:, 1] = y_c - h_half
scaled_boxes[:, 3] = y_c + h_half
return scaled_boxes
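# Illustrative example: padding 28x28 masks by one cell gives 30x30 masks and
# scale = 30 / 28; scaling the boxes by the same factor keeps each padded mask
# aligned with its (enlarged) box:
#
#   padded, scale = pad_masks(torch.rand(5, 28, 28), padding=1)  # (5, 30, 30)
#   boxes = scale_boxes(torch.tensor([[10.0, 10.0, 38.0, 38.0]]), scale)
#   # the box widens from 28 to 30 around its center (24, 24): [[9., 9., 39., 39.]]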
@torch.jit.script_if_tracing
def _paste_masks_tensor_shape(
masks: torch.Tensor,
boxes: torch.Tensor,
image_shape: Tuple[torch.Tensor, torch.Tensor],
threshold: float = 0.5,
):
"""
A wrapper of paste_masks_in_image where image_shape is Tensor.
During tracing, shapes might be tensors instead of ints. The Tensor->int
conversion should be scripted rather than traced.
"""
return paste_masks_in_image(masks, boxes, (int(image_shape[0]), int(image_shape[1])), threshold)
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
import torch
from torchvision.ops import boxes as box_ops
from torchvision.ops import nms # noqa . for compatibility
from detectron2.layers.wrappers import disable_torch_compiler
def batched_nms(
boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float
):
"""
Same as torchvision.ops.boxes.batched_nms, but with float().
"""
assert boxes.shape[-1] == 4
# Note: Torchvision already has a strategy (https://github.com/pytorch/vision/issues/1311)
# to decide whether to use coordinate trick or for loop to implement batched_nms. So we
# just call it directly.
# Fp16 does not have enough range for batched NMS, so we add float().
return box_ops.batched_nms(boxes.float(), scores, idxs, iou_threshold)
# Note: this function (nms_rotated) might be moved into
# torchvision/ops/boxes.py in the future
@disable_torch_compiler
def nms_rotated(boxes: torch.Tensor, scores: torch.Tensor, iou_threshold: float):
"""
Performs non-maximum suppression (NMS) on the rotated boxes according
to their intersection-over-union (IoU).
Rotated NMS iteratively removes lower scoring rotated boxes which have an
IoU greater than iou_threshold with another (higher scoring) rotated box.
Note that RotatedBox (5, 3, 4, 2, -90) covers exactly the same region as
RotatedBox (5, 3, 4, 2, 90) does, and their IoU will be 1. However, they
can represent completely different objects in certain tasks, e.g., OCR.
As for the question of whether rotated-NMS should treat them as faraway boxes
even though their IoU is 1, it depends on the application and/or ground truth annotation.
As an extreme example, consider a single character v and the square box around it.
If the angle is 0 degree, the object (text) would be read as 'v';
If the angle is 90 degrees, the object (text) would become '>';
If the angle is 180 degrees, the object (text) would become '^';
If the angle is 270/-90 degrees, the object (text) would become '<'
All of these cases have IoU of 1 to each other, and rotated NMS that only
uses IoU as its criterion would keep only the one with the highest score -
which, practically, still makes sense in most cases because typically
only one of these orientations is the correct one. Also, it matters less
if the box is only used to classify the object later (instead of transcribing
it with a sequential OCR recognition model).
On the other hand, when we use IoU to filter proposals that are close to the
ground truth during training, we should definitely take the angle into account if
we know the ground truth is labeled with the strictly correct orientation (as in,
upside-down words are annotated with -180 degrees even though they can be covered
with a 0/90/-90 degree box, etc.)
The way the original dataset is annotated also matters. For example, if the dataset
is a 4-point polygon dataset that does not enforce ordering of vertices/orientation,
we can estimate a minimum rotated bounding box to this polygon, but there's no way
we can tell the correct angle with 100% confidence (as shown above, there could be 4 different
rotated boxes, with angles differing by 90 degrees from each other, covering exactly the
same region). In that case we just have to use IoU to determine the box
proximity (as many detection benchmarks, even for text, do) unless there are other
assumptions we can make (like width is always larger than height, or the object is not
rotated by more than 90 degrees CCW/CW, etc.)
In summary, not considering angles in rotated NMS seems to be a good option for now,
but we should be aware of its implications.
Args:
boxes (Tensor[N, 5]): Rotated boxes to perform NMS on. They are expected to be in
(x_center, y_center, width, height, angle_degrees) format.
scores (Tensor[N]): Scores for each one of the rotated boxes
iou_threshold (float): Discards all overlapping rotated boxes with IoU > iou_threshold
Returns:
keep (Tensor): int64 tensor with the indices of the elements that have been kept
by Rotated NMS, sorted in decreasing order of scores
"""
return torch.ops.detectron2.nms_rotated(boxes, scores, iou_threshold)
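# Illustrative example (not part of the original file):
#
#   boxes = torch.tensor([[50.0, 50.0, 40.0, 20.0, 30.0],
#                         [50.0, 50.0, 40.0, 20.0, 31.0],
#                         [150.0, 50.0, 40.0, 20.0, -30.0]])
#   scores = torch.tensor([0.9, 0.8, 0.7])
#   nms_rotated(boxes, scores, iou_threshold=0.5)  # tensor([0, 2]): box 1 nearly
#   # coincides with the higher-scoring box 0 and is suppressed; box 2 is disjoint.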
# Note: this function (batched_nms_rotated) might be moved into
# torchvision/ops/boxes.py in the future
@torch.jit.script_if_tracing
def batched_nms_rotated(
boxes: torch.Tensor, scores: torch.Tensor, idxs: torch.Tensor, iou_threshold: float
):
"""
Performs non-maximum suppression in a batched fashion.
Each index value corresponds to a category, and NMS
will not be applied between elements of different categories.
Args:
boxes (Tensor[N, 5]):
boxes where NMS will be performed. They
are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format
scores (Tensor[N]):
scores for each one of the boxes
idxs (Tensor[N]):
indices of the categories for each one of the boxes.
iou_threshold (float):
discards all overlapping boxes
with IoU > iou_threshold
Returns:
Tensor:
int64 tensor with the indices of the elements that have been kept
by NMS, sorted in decreasing order of scores
"""
assert boxes.shape[-1] == 5
if boxes.numel() == 0:
return torch.empty((0,), dtype=torch.int64, device=boxes.device)
boxes = boxes.float() # fp16 does not have enough range for batched NMS
# Strategy: in order to perform NMS independently per class,
# we add an offset to all the boxes. The offset is dependent
# only on the class idx, and is large enough so that boxes
# from different classes do not overlap
# Note that batched_nms in torchvision/ops/boxes.py only uses max_coordinate,
# which won't handle negative coordinates correctly.
# Here by using min_coordinate we can make sure the negative coordinates are
# correctly handled.
max_coordinate = (
torch.max(boxes[:, 0], boxes[:, 1]) + torch.max(boxes[:, 2], boxes[:, 3]) / 2
).max()
min_coordinate = (
torch.min(boxes[:, 0], boxes[:, 1]) - torch.max(boxes[:, 2], boxes[:, 3]) / 2
).min()
offsets = idxs.to(boxes) * (max_coordinate - min_coordinate + 1)
boxes_for_nms = boxes.clone() # avoid modifying the original values in boxes
boxes_for_nms[:, :2] += offsets[:, None]
keep = nms_rotated(boxes_for_nms, scores, iou_threshold)
return keep
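# Illustrative example of the offset trick: if the boxes span a range such that
# max_coordinate - min_coordinate + 1 == 100, every box of class k has its
# center shifted by (100 * k, 100 * k) before NMS. Boxes of different classes
# can then never overlap, so a single nms_rotated call over the shifted boxes
# is equivalent to running NMS independently within each class.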
# Copyright (c) Facebook, Inc. and its affiliates.
from torch import nn
from torchvision.ops import roi_align
# NOTE: torchvision's RoIAlign has a different default aligned=False
class ROIAlign(nn.Module):
def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True):
"""
Args:
output_size (tuple): h, w
spatial_scale (float): scale the input boxes by this number
sampling_ratio (int): number of input samples to take for each output
sample. 0 to take samples densely.
aligned (bool): if False, use the legacy implementation in
Detectron. If True, align the results more perfectly.
Note:
The meaning of aligned=True:
Given a continuous coordinate c, its two neighboring pixel indices (in our
pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example,
c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled
from the underlying signal at continuous coordinates 0.5 and 1.5). But the original
roi_align (aligned=False) does not subtract the 0.5 when computing neighboring
pixel indices and therefore it uses pixels with a slightly incorrect alignment
(relative to our pixel model) when performing bilinear interpolation.
With `aligned=True`,
we first appropriately scale the ROI and then shift it by -0.5
prior to calling roi_align. This produces the correct neighbors; see
detectron2/tests/test_roi_align.py for verification.
This difference does not affect the model's performance if
ROIAlign is used together with conv layers.
"""
super().__init__()
self.output_size = output_size
self.spatial_scale = spatial_scale
self.sampling_ratio = sampling_ratio
self.aligned = aligned
from torchvision import __version__
version = tuple(int(x) for x in __version__.split(".")[:2])
# https://github.com/pytorch/vision/pull/2438
assert version >= (0, 7), "Require torchvision >= 0.7"
def forward(self, input, rois):
"""
Args:
input: NCHW images
rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy.
"""
assert rois.dim() == 2 and rois.size(1) == 5
if input.is_quantized:
input = input.dequantize()
return roi_align(
input,
rois.to(dtype=input.dtype),
self.output_size,
self.spatial_scale,
self.sampling_ratio,
self.aligned,
)
def __repr__(self):
tmpstr = self.__class__.__name__ + "("
tmpstr += "output_size=" + str(self.output_size)
tmpstr += ", spatial_scale=" + str(self.spatial_scale)
tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
tmpstr += ", aligned=" + str(self.aligned)
tmpstr += ")"
return tmpstr
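# Illustrative usage sketch: pool 7x7 features from a stride-16 feature map:
#
#   layer = ROIAlign(output_size=(7, 7), spatial_scale=1.0 / 16, sampling_ratio=0)
#   feats = torch.randn(1, 256, 50, 50)
#   rois = torch.tensor([[0.0, 64.0, 64.0, 256.0, 256.0]])  # (batch_idx, x0, y0, x1, y1)
#   out = layer(feats, rois)  # (1, 256, 7, 7)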
# Copyright (c) Facebook, Inc. and its affiliates.
import torch
from torch import nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair
from detectron2.layers.wrappers import disable_torch_compiler
class _ROIAlignRotated(Function):
@staticmethod
@disable_torch_compiler
def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio):
ctx.save_for_backward(roi)
ctx.output_size = _pair(output_size)
ctx.spatial_scale = spatial_scale
ctx.sampling_ratio = sampling_ratio
ctx.input_shape = input.size()
output = torch.ops.detectron2.roi_align_rotated_forward(
input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio
)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
(rois,) = ctx.saved_tensors
output_size = ctx.output_size
spatial_scale = ctx.spatial_scale
sampling_ratio = ctx.sampling_ratio
bs, ch, h, w = ctx.input_shape
grad_input = torch.ops.detectron2.roi_align_rotated_backward(
grad_output,
rois,
spatial_scale,
output_size[0],
output_size[1],
bs,
ch,
h,
w,
sampling_ratio,
)
return grad_input, None, None, None, None, None
roi_align_rotated = _ROIAlignRotated.apply
class ROIAlignRotated(nn.Module):
def __init__(self, output_size, spatial_scale, sampling_ratio):
"""
Args:
output_size (tuple): h, w
spatial_scale (float): scale the input boxes by this number
sampling_ratio (int): number of input samples to take for each output
sample. 0 to take samples densely.
Note:
ROIAlignRotated supports continuous coordinate by default:
Given a continuous coordinate c, its two neighboring pixel indices (in our
pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example,
c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled
from the underlying signal at continuous coordinates 0.5 and 1.5).
"""
super(ROIAlignRotated, self).__init__()
self.output_size = output_size
self.spatial_scale = spatial_scale
self.sampling_ratio = sampling_ratio
def forward(self, input, rois):
"""
Args:
input: NCHW images
rois: Bx6 boxes. First column is the index into N.
The other 5 columns are (x_ctr, y_ctr, width, height, angle_degrees).
"""
assert rois.dim() == 2 and rois.size(1) == 6
orig_dtype = input.dtype
if orig_dtype == torch.float16:
input = input.float()
rois = rois.float()
output_size = _pair(self.output_size)
# Scripting for Autograd is currently unsupported.
# This is a quick fix without having to rewrite code on the C++ side
if torch.jit.is_scripting() or torch.jit.is_tracing():
return torch.ops.detectron2.roi_align_rotated_forward(
input, rois, self.spatial_scale, output_size[0], output_size[1], self.sampling_ratio
).to(dtype=orig_dtype)
return roi_align_rotated(
input, rois, self.output_size, self.spatial_scale, self.sampling_ratio
).to(dtype=orig_dtype)
def __repr__(self):
tmpstr = self.__class__.__name__ + "("
tmpstr += "output_size=" + str(self.output_size)
tmpstr += ", spatial_scale=" + str(self.spatial_scale)
tmpstr += ", sampling_ratio=" + str(self.sampling_ratio)
tmpstr += ")"
return tmpstr
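# Illustrative usage sketch: like ROIAlign, but each roi also carries an angle:
#
#   layer = ROIAlignRotated(output_size=(7, 7), spatial_scale=1.0 / 16, sampling_ratio=0)
#   feats = torch.randn(1, 256, 50, 50)
#   rois = torch.tensor([[0.0, 160.0, 160.0, 192.0, 96.0, 30.0]])
#   # rois: (batch_idx, x_ctr, y_ctr, width, height, angle_degrees)
#   out = layer(feats, rois)  # (1, 256, 7, 7)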
# Copyright (c) Facebook, Inc. and its affiliates.
from __future__ import absolute_import, division, print_function, unicode_literals
import torch
def pairwise_iou_rotated(boxes1, boxes2):
"""
Return intersection-over-union (Jaccard index) of boxes.
Both sets of boxes are expected to be in
(x_center, y_center, width, height, angle) format.
Arguments:
boxes1 (Tensor[N, 5])
boxes2 (Tensor[M, 5])
Returns:
iou (Tensor[N, M]): the NxM matrix containing the pairwise
IoU values for every element in boxes1 and boxes2
"""
return torch.ops.detectron2.box_iou_rotated(boxes1, boxes2)
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
from dataclasses import dataclass
from typing import Optional
@dataclass
class ShapeSpec:
"""
A simple structure that contains basic shape specification about a tensor.
It is often used as the auxiliary inputs/outputs of models,
to complement the lack of shape inference ability among pytorch modules.
"""
channels: Optional[int] = None
height: Optional[int] = None
width: Optional[int] = None
stride: Optional[int] = None
# Copyright (c) Facebook, Inc. and its affiliates.
"""
Wrappers around some nn functions, mainly to support empty tensors.
Ideally, add support directly in PyTorch to empty tensors in those functions.
These can be removed once https://github.com/pytorch/pytorch/issues/12013
is implemented
"""
import functools
import warnings
from typing import List, Optional
import torch
from torch.nn import functional as F
from detectron2.utils.env import TORCH_VERSION
def shapes_to_tensor(x: List[int], device: Optional[torch.device] = None) -> torch.Tensor:
"""
Turn a list of integer scalars or integer Tensor scalars into a vector,
in a way that's both traceable and scriptable.
In tracing, `x` should be a list of scalar Tensors, so the output can trace to the inputs.
In scripting or eager, `x` should be a list of int.
"""
if torch.jit.is_scripting():
return torch.as_tensor(x, device=device)
if torch.jit.is_tracing():
assert all(
[isinstance(t, torch.Tensor) for t in x]
), "Shape should be tensor during tracing!"
# as_tensor should not be used in tracing because it records a constant
ret = torch.stack(x)
if ret.device != device: # avoid recording a hard-coded device if not necessary
ret = ret.to(device=device)
return ret
return torch.as_tensor(x, device=device)
def check_if_dynamo_compiling():
if TORCH_VERSION >= (2, 1):
from torch._dynamo import is_compiling
return is_compiling()
else:
return False
def disable_torch_compiler(func):
if TORCH_VERSION >= (2, 1):
# Use the torch.compiler.disable decorator if supported
@torch.compiler.disable
@functools.wraps(func)
def wrapper(*args, **kwargs):
return func(*args, **kwargs)
return wrapper
else:
# Return the function unchanged if torch.compiler.disable is not supported
return func
def cat(tensors: List[torch.Tensor], dim: int = 0):
"""
Efficient version of torch.cat that avoids a copy if there is only a single element in a list
"""
assert isinstance(tensors, (list, tuple))
if len(tensors) == 1:
return tensors[0]
return torch.cat(tensors, dim)
def empty_input_loss_func_wrapper(loss_func):
def wrapped_loss_func(input, target, *, reduction="mean", **kwargs):
"""
Same as `loss_func`, but returns 0 (instead of nan) for empty inputs.
"""
if target.numel() == 0 and reduction == "mean":
return input.sum() * 0.0 # connect the gradient
return loss_func(input, target, reduction=reduction, **kwargs)
return wrapped_loss_func
cross_entropy = empty_input_loss_func_wrapper(F.cross_entropy)
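# Illustrative behavior (hypothetical shapes): with an empty batch,
#   cross_entropy(torch.empty(0, 10), torch.empty(0, dtype=torch.long))
# returns a zero that stays connected to the graph, whereas F.cross_entropy
# with reduction="mean" would return NaN.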
class _NewEmptyTensorOp(torch.autograd.Function):
@staticmethod
def forward(ctx, x, new_shape):
ctx.shape = x.shape
return x.new_empty(new_shape)
@staticmethod
def backward(ctx, grad):
shape = ctx.shape
return _NewEmptyTensorOp.apply(grad, shape), None
class Conv2d(torch.nn.Conv2d):
"""
A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features.
"""
def __init__(self, *args, **kwargs):
"""
Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`:
Args:
norm (nn.Module, optional): a normalization layer
activation (callable(Tensor) -> Tensor): a callable activation function
It assumes that norm layer is used before activation.
"""
norm = kwargs.pop("norm", None)
activation = kwargs.pop("activation", None)
super().__init__(*args, **kwargs)
self.norm = norm
self.activation = activation
def forward(self, x):
# torchscript does not support SyncBatchNorm yet
# https://github.com/pytorch/pytorch/issues/40507
# and we skip this code in torchscript since:
# 1. currently we only support torchscript in evaluation mode
# 2. the features needed to export a module to torchscript were added in PyTorch 1.6 or
# later, and `Conv2d` in those PyTorch versions already supports empty inputs.
if not torch.jit.is_scripting():
# Dynamo doesn't support context managers yet
is_dynamo_compiling = check_if_dynamo_compiling()
if not is_dynamo_compiling:
with warnings.catch_warnings(record=True):
if x.numel() == 0 and self.training:
# https://github.com/pytorch/pytorch/issues/12013
assert not isinstance(
self.norm, torch.nn.SyncBatchNorm
), "SyncBatchNorm does not support empty inputs!"
x = F.conv2d(
x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
)
if self.norm is not None:
x = self.norm(x)
if self.activation is not None:
x = self.activation(x)
return x
ConvTranspose2d = torch.nn.ConvTranspose2d
BatchNorm2d = torch.nn.BatchNorm2d
interpolate = F.interpolate
Linear = torch.nn.Linear
def nonzero_tuple(x):
"""
An 'as_tuple=True' version of torch.nonzero to support torchscript,
because of https://github.com/pytorch/pytorch/issues/38718
"""
if torch.jit.is_scripting():
if x.dim() == 0:
return x.unsqueeze(0).nonzero().unbind(1)
return x.nonzero().unbind(1)
else:
return x.nonzero(as_tuple=True)
@torch.jit.script_if_tracing
def move_device_like(src: torch.Tensor, dst: torch.Tensor) -> torch.Tensor:
"""
Tracing-friendly way to cast a tensor to another tensor's device. The device would be
treated as a constant during tracing; scripting the cast as a whole works around this issue.
"""
return src.to(dst.device)
# Copyright (c) Facebook, Inc. and its affiliates.
"""
Model Zoo API for Detectron2: a collection of functions to create common model architectures
listed in `MODEL_ZOO.md <https://github.com/facebookresearch/detectron2/blob/main/MODEL_ZOO.md>`_,
and optionally load their pre-trained weights.
"""
from .model_zoo import get, get_config_file, get_checkpoint_url, get_config
__all__ = ["get_checkpoint_url", "get", "get_config_file", "get_config"]
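# Illustrative usage sketch (the config name is one of those listed in MODEL_ZOO.md):
#
#   from detectron2 import model_zoo
#   model = model_zoo.get("COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml", trained=True)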
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
RPN:
PRE_NMS_TOPK_TEST: 6000
POST_NMS_TOPK_TEST: 1000
ROI_HEADS:
NAME: "Res5ROIHeads"
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.02
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
RESNETS:
OUT_FEATURES: ["res5"]
RES5_DILATION: 2
RPN:
IN_FEATURES: ["res5"]
PRE_NMS_TOPK_TEST: 6000
POST_NMS_TOPK_TEST: 1000
ROI_HEADS:
NAME: "StandardROIHeads"
IN_FEATURES: ["res5"]
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_FC: 2
POOLER_RESOLUTION: 7
ROI_MASK_HEAD:
NAME: "MaskRCNNConvUpsampleHead"
NUM_CONV: 4
POOLER_RESOLUTION: 14
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.02
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2
MODEL:
META_ARCHITECTURE: "GeneralizedRCNN"
BACKBONE:
NAME: "build_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
FPN:
IN_FEATURES: ["res2", "res3", "res4", "res5"]
ANCHOR_GENERATOR:
SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map
ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps)
RPN:
IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level
PRE_NMS_TOPK_TEST: 1000 # Per FPN level
# Detectron1 uses 2000 proposals per-batch,
# (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
# which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
POST_NMS_TOPK_TRAIN: 1000
POST_NMS_TOPK_TEST: 1000
ROI_HEADS:
NAME: "StandardROIHeads"
IN_FEATURES: ["p2", "p3", "p4", "p5"]
ROI_BOX_HEAD:
NAME: "FastRCNNConvFCHead"
NUM_FC: 2
POOLER_RESOLUTION: 7
ROI_MASK_HEAD:
NAME: "MaskRCNNConvUpsampleHead"
NUM_CONV: 4
POOLER_RESOLUTION: 14
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.02
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2
MODEL:
META_ARCHITECTURE: "RetinaNet"
BACKBONE:
NAME: "build_retinanet_resnet_fpn_backbone"
RESNETS:
OUT_FEATURES: ["res3", "res4", "res5"]
ANCHOR_GENERATOR:
SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"]
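# The eval above expands each base size x into three octave scales
# (x, x * 2**(1/3), x * 2**(2/3)), e.g. 32 -> [32, 40.3, 50.8].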
FPN:
IN_FEATURES: ["res3", "res4", "res5"]
RETINANET:
IOU_THRESHOLDS: [0.4, 0.5]
IOU_LABELS: [0, -1, 1]
SMOOTH_L1_LOSS_BETA: 0.0
DATASETS:
TRAIN: ("coco_2017_train",)
TEST: ("coco_2017_val",)
SOLVER:
IMS_PER_BATCH: 16
BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate
STEPS: (60000, 80000)
MAX_ITER: 90000
INPUT:
MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
VERSION: 2
_BASE_: "../Base-RCNN-FPN.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
MASK_ON: False
LOAD_PROPOSALS: True
RESNETS:
DEPTH: 50
PROPOSAL_GENERATOR:
NAME: "PrecomputedProposals"
DATASETS:
TRAIN: ("coco_2017_train",)
PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_train_box_proposals_21bc3a.pkl", )
TEST: ("coco_2017_val",)
PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", )
DATALOADER:
# proposals are part of the dataset_dicts, and take a lot of RAM
NUM_WORKERS: 2
_BASE_: "../Base-RCNN-C4.yaml"
MODEL:
WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl"
MASK_ON: False
RESNETS:
DEPTH: 101
SOLVER:
STEPS: (210000, 250000)
MAX_ITER: 270000