boxes.py 12.1 KB
Newer Older
Aditya Oke's avatar
Aditya Oke committed
1
from typing import Tuple
2
3

import torch
4
import torchvision
5
from torch import Tensor
6
from torchvision.extension import _assert_has_ops
7

8
from ..utils import _log_api_usage_once
9
10
from ._box_convert import _box_cxcywh_to_xyxy, _box_xyxy_to_cxcywh, _box_xywh_to_xyxy, _box_xyxy_to_xywh

11

12
def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
    """
    Performs non-maximum suppression (NMS) on the boxes according
    to their intersection-over-union (IoU).

    NMS iteratively removes lower scoring boxes which have an
    IoU greater than iou_threshold with another (higher scoring)
    box.

    If multiple boxes have the exact same score and satisfy the IoU
    criterion with respect to a reference box, the selected box is
    not guaranteed to be the same between CPU and GPU. This is similar
    to the behavior of argsort in PyTorch when repeated values are present.

    Args:
        boxes (Tensor[N, 4]): boxes to perform NMS on. They
            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
            ``0 <= y1 < y2``.
        scores (Tensor[N]): scores for each one of the boxes
        iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold

    Returns:
        Tensor: int64 tensor with the indices of the elements that have been kept
        by NMS, sorted in decreasing order of scores
    """
    _log_api_usage_once("torchvision.ops.nms")
    # NOTE(review): presumably raises when the compiled torchvision ops are not
    # loaded, so the dispatch below fails with a clear message - confirm in
    # torchvision.extension.
    _assert_has_ops()
    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
40
41


42
43
44
45
46
47
def batched_nms(
    boxes: Tensor,
    scores: Tensor,
    idxs: Tensor,
    iou_threshold: float,
) -> Tensor:
    """
    Performs non-maximum suppression in a batched fashion.

    Each index value corresponds to a category, and NMS
    will not be applied between elements of different categories.

    Args:
        boxes (Tensor[N, 4]): boxes where NMS will be performed. They
            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
            ``0 <= y1 < y2``.
        scores (Tensor[N]): scores for each one of the boxes
        idxs (Tensor[N]): indices of the categories for each one of the boxes.
        iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold

    Returns:
        Tensor: int64 tensor with the indices of the elements that have been kept by NMS, sorted
        in decreasing order of scores
    """
    _log_api_usage_once("torchvision.ops.batched_nms")
    # Benchmarks that drove the following thresholds are at
    # https://github.com/pytorch/vision/issues/1311#issuecomment-781329339
    # Ideally for GPU we'd use a higher threshold
    # numel() counts coordinates, so for an [N, 4] input the per-class loop is
    # chosen once there are more than 1000 boxes (and only outside tracing).
    if boxes.numel() > 4_000 and not torchvision._is_tracing():
        return _batched_nms_vanilla(boxes, scores, idxs, iou_threshold)
    else:
        return _batched_nms_coordinate_trick(boxes, scores, idxs, iou_threshold)


@torch.jit._script_if_tracing
def _batched_nms_coordinate_trick(
    boxes: Tensor,
    scores: Tensor,
    idxs: Tensor,
    iou_threshold: float,
) -> Tensor:
    # strategy: in order to perform NMS independently per class,
84
85
86
    # we add an offset to all the boxes. The offset is dependent
    # only on the class idx, and is large enough so that boxes
    # from different classes do not overlap
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
    if boxes.numel() == 0:
        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
    max_coordinate = boxes.max()
    offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))
    boxes_for_nms = boxes + offsets[:, None]
    keep = nms(boxes_for_nms, scores, iou_threshold)
    return keep


@torch.jit._script_if_tracing
def _batched_nms_vanilla(
    boxes: Tensor,
    scores: Tensor,
    idxs: Tensor,
    iou_threshold: float,
) -> Tensor:
    # Based on Detectron2 implementation, just manually call nms() on each class independently
    keep_mask = torch.zeros_like(scores, dtype=torch.bool)
    for class_id in torch.unique(idxs):
        curr_indices = torch.where(idxs == class_id)[0]
        curr_keep_indices = nms(boxes[curr_indices], scores[curr_indices], iou_threshold)
        keep_mask[curr_indices[curr_keep_indices]] = True
    keep_indices = torch.where(keep_mask)[0]
    return keep_indices[scores[keep_indices].sort(descending=True)[1]]
111
112


113
def remove_small_boxes(boxes: Tensor, min_size: float) -> Tensor:
    """
    Remove boxes which contain at least one side smaller than min_size.

    Args:
        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        min_size (float): minimum size

    Returns:
        Tensor[K]: indices of the boxes that have both sides
        larger than or equal to min_size
    """
    _log_api_usage_once("torchvision.ops.remove_small_boxes")
    ws, hs = boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1]
    # A box is kept only when both its width and its height reach min_size.
    keep = (ws >= min_size) & (hs >= min_size)
    keep = torch.where(keep)[0]
    return keep


133
def clip_boxes_to_image(boxes: Tensor, size: Tuple[int, int]) -> Tensor:
    """
    Clip boxes so that they lie inside an image of size `size`.

    Args:
        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        size (Tuple[height, width]): size of the image

    Returns:
        Tensor[N, 4]: clipped boxes
    """
    _log_api_usage_once("torchvision.ops.clip_boxes_to_image")
    dim = boxes.dim()
    # Even positions of the last axis hold x coordinates, odd positions hold y.
    boxes_x = boxes[..., 0::2]
    boxes_y = boxes[..., 1::2]
    height, width = size

    if torchvision._is_tracing():
        # NOTE(review): the tracing branch clamps against tensors instead of
        # Python scalars, presumably so the bounds stay part of the traced
        # graph - confirm against the export requirements.
        boxes_x = torch.max(boxes_x, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
        boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
        boxes_y = torch.max(boxes_y, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
        boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
    else:
        boxes_x = boxes_x.clamp(min=0, max=width)
        boxes_y = boxes_y.clamp(min=0, max=height)

    # Stacking on a new trailing axis pairs each x with its y, so the reshape
    # restores the original (x1, y1, x2, y2) interleaving.
    clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim)
    return clipped_boxes.reshape(boxes.shape)


164
165
166
167
168
169
def box_convert(boxes: Tensor, in_fmt: str, out_fmt: str) -> Tensor:
    """
    Converts boxes from given in_fmt to out_fmt.
    Supported in_fmt and out_fmt are:

    'xyxy': boxes are represented via corners, x1, y1 being top left and x2, y2 being bottom right.
    This is the format that torchvision utilities expect.

    'xywh' : boxes are represented via corner, width and height, x1, y1 being top left, w, h being width and height.

    'cxcywh' : boxes are represented via centre, width and height, cx, cy being center of box, w, h
    being width and height.

    Args:
        boxes (Tensor[N, 4]): boxes which will be converted.
        in_fmt (str): Input format of given boxes. Supported formats are ['xyxy', 'xywh', 'cxcywh'].
        out_fmt (str): Output format of given boxes. Supported formats are ['xyxy', 'xywh', 'cxcywh'].

    Returns:
        Tensor[N, 4]: Boxes into converted format.

    Raises:
        ValueError: if ``in_fmt`` or ``out_fmt`` is not one of the supported formats.
    """

    _log_api_usage_once("torchvision.ops.box_convert")
    allowed_fmts = ("xyxy", "xywh", "cxcywh")
    if in_fmt not in allowed_fmts or out_fmt not in allowed_fmts:
        raise ValueError("Unsupported Bounding Box Conversions for given in_fmt and out_fmt")

    if in_fmt == out_fmt:
        # Return a copy so the caller can never alias the input by accident.
        return boxes.clone()

    # 'xyxy' is the pivot format: first bring the input to xyxy (no-op when it
    # already is), then go from xyxy to the requested output (no-op for xyxy).
    if in_fmt == "xywh":
        boxes = _box_xywh_to_xyxy(boxes)
    elif in_fmt == "cxcywh":
        boxes = _box_cxcywh_to_xyxy(boxes)

    if out_fmt == "xywh":
        boxes = _box_xyxy_to_xywh(boxes)
    elif out_fmt == "cxcywh":
        boxes = _box_xyxy_to_cxcywh(boxes)
    return boxes
213
214


215
216
217
218
219
220
221
222
def _upcast(t: Tensor) -> Tensor:
    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
    if t.is_floating_point():
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    else:
        return t if t.dtype in (torch.int32, torch.int64) else t.int()


223
def box_area(boxes: Tensor) -> Tensor:
    """
    Computes the area of a set of bounding boxes, which are specified by their
    (x1, y1, x2, y2) coordinates.

    Args:
        boxes (Tensor[N, 4]): boxes for which the area will be computed. They
            are expected to be in (x1, y1, x2, y2) format with
            ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Returns:
        Tensor[N]: the area for each box
    """
    _log_api_usage_once("torchvision.ops.box_area")
    # Upcast before multiplying width by height to protect against overflow.
    boxes = _upcast(boxes)
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
# with slight modifications
243
244
245
246
247
248
249
def _box_inter_union(boxes1: Tensor, boxes2: Tensor) -> Tuple[Tensor, Tensor]:
    """
    Compute pairwise intersection and union areas between two sets of boxes.

    Args:
        boxes1 (Tensor[N, 4]): first set of boxes in ``(x1, y1, x2, y2)`` format
        boxes2 (Tensor[M, 4]): second set of boxes in ``(x1, y1, x2, y2)`` format

    Returns:
        Tuple[Tensor[N, M], Tensor[N, M]]: intersection areas and union areas
        for every pair of boxes
    """
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    # Broadcasting boxes1 over a new middle axis pairs every box in boxes1
    # with every box in boxes2.
    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    # Non-overlapping pairs give a negative extent; clamp it to zero.
    wh = _upcast(rb - lt).clamp(min=0)  # [N,M,2]
    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    return inter, union


258
def box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
    """
    Return intersection-over-union (Jaccard index) between two sets of boxes.

    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Args:
        boxes1 (Tensor[N, 4]): first set of boxes
        boxes2 (Tensor[M, 4]): second set of boxes

    Returns:
        Tensor[N, M]: the NxM matrix containing the pairwise IoU values for every element in boxes1 and boxes2
    """
    _log_api_usage_once("torchvision.ops.box_iou")
    inter, union = _box_inter_union(boxes1, boxes2)
    iou = inter / union
    return iou
Aditya Oke's avatar
Aditya Oke committed
276
277
278
279
280


# Implementation adapted from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
    """
    Return generalized intersection-over-union (Jaccard index) between two sets of boxes.

    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Args:
        boxes1 (Tensor[N, 4]): first set of boxes
        boxes2 (Tensor[M, 4]): second set of boxes

    Returns:
        Tensor[N, M]: the NxM matrix containing the pairwise generalized IoU values
        for every element in boxes1 and boxes2

    Raises:
        AssertionError: if any box has ``x2 < x1`` or ``y2 < y1``.
    """

    _log_api_usage_once("torchvision.ops.generalized_box_iou")
    # degenerate boxes gives inf / nan results
    # so do an early check
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all(), "boxes1 must satisfy x2 >= x1 and y2 >= y1"
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all(), "boxes2 must satisfy x2 >= x1 and y2 >= y1"

    inter, union = _box_inter_union(boxes1, boxes2)
    iou = inter / union

    # Corners of the smallest box enclosing each pair: min of the top-left
    # corners and max of the bottom-right corners.
    lti = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    rbi = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    whi = _upcast(rbi - lti).clamp(min=0)  # [N,M,2]
    areai = whi[:, :, 0] * whi[:, :, 1]

    # Penalize IoU by the fraction of the enclosing box not covered by the union.
    return iou - (areai - union) / areai
311
312
313
314


def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
    """
    Compute the bounding boxes around the provided masks.

    Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 <= x2`` and ``0 <= y1 <= y2``. The corners are the minimum and maximum
    coordinates of the non-zero pixels, so a mask that is a single pixel wide (or tall)
    yields ``x1 == x2`` (or ``y1 == y2``).

    Args:
        masks (Tensor[N, H, W]): masks to transform where N is the number of masks
            and (H, W) are the spatial dimensions.

    Returns:
        Tensor[N, 4]: bounding boxes, as a float tensor on the device of ``masks``
    """
    _log_api_usage_once("torchvision.ops.masks_to_boxes")
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device, dtype=torch.float)

    n = masks.shape[0]

    bounding_boxes = torch.zeros((n, 4), device=masks.device, dtype=torch.float)

    for index, mask in enumerate(masks):
        # Row (y) and column (x) indices of every non-zero pixel of this mask.
        # NOTE(review): a mask with no non-zero pixel passes empty tensors to
        # torch.min/torch.max below - confirm callers guarantee non-empty masks.
        y, x = torch.where(mask != 0)

        bounding_boxes[index, 0] = torch.min(x)
        bounding_boxes[index, 1] = torch.min(y)
        bounding_boxes[index, 2] = torch.max(x)
        bounding_boxes[index, 3] = torch.max(y)

    return bounding_boxes