from typing import Tuple

import torch
import torchvision
from torch import Tensor
from torchvision.extension import _assert_has_ops

from ..utils import _log_api_usage_once
from ._box_convert import _box_cxcywh_to_xyxy, _box_xywh_to_xyxy, _box_xyxy_to_cxcywh, _box_xyxy_to_xywh
from ._utils import _upcast


def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
    """
    Performs non-maximum suppression (NMS) on the boxes according
    to their intersection-over-union (IoU).

    NMS iteratively removes lower scoring boxes which have an
    IoU greater than ``iou_threshold`` with another (higher scoring)
    box.

    If multiple boxes have the exact same score and satisfy the IoU
    criterion with respect to a reference box, the selected box is
    not guaranteed to be the same between CPU and GPU. This is similar
    to the behavior of argsort in PyTorch when repeated values are present.

    Args:
        boxes (Tensor[N, 4]): boxes to perform NMS on. They
            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
            ``0 <= y1 < y2``.
        scores (Tensor[N]): scores for each one of the boxes
        iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold

    Returns:
        Tensor: int64 tensor with the indices of the elements that have been kept
        by NMS, sorted in decreasing order of scores
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(nms)
    # The computation itself lives in the compiled torchvision extension;
    # fail early with a clear error if the custom ops are not available.
    _assert_has_ops()
    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)


def batched_nms(
    boxes: Tensor,
    scores: Tensor,
    idxs: Tensor,
    iou_threshold: float,
) -> Tensor:
    """
    Performs non-maximum suppression in a batched fashion.

    Each index value correspond to a category, and NMS
    will not be applied between elements of different categories.

    Args:
        boxes (Tensor[N, 4]): boxes where NMS will be performed. They
            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
            ``0 <= y1 < y2``.
        scores (Tensor[N]): scores for each one of the boxes
        idxs (Tensor[N]): indices of the categories for each one of the boxes.
        iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold

    Returns:
        Tensor: int64 tensor with the indices of the elements that have been kept by NMS, sorted
        in decreasing order of scores
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(batched_nms)
    # Benchmarks that drove the following thresholds are at
    # https://github.com/pytorch/vision/issues/1311#issuecomment-781329339
    # Below the threshold (or while tracing) a single offset-shifted NMS call
    # is cheaper; above it, running NMS once per class wins.
    threshold = 4000 if boxes.device.type == "cpu" else 20000
    if boxes.numel() <= threshold or torchvision._is_tracing():
        return _batched_nms_coordinate_trick(boxes, scores, idxs, iou_threshold)
    return _batched_nms_vanilla(boxes, scores, idxs, iou_threshold)


@torch.jit._script_if_tracing
def _batched_nms_coordinate_trick(
    boxes: Tensor,
    scores: Tensor,
    idxs: Tensor,
    iou_threshold: float,
) -> Tensor:
    # Strategy: to run NMS independently per class, shift every box by an
    # offset that depends only on its class index and is larger than the
    # maximum coordinate, so boxes of different classes can never overlap
    # and a single NMS call suffices.
    if boxes.numel() == 0:
        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
    largest_coord = boxes.max()
    class_offsets = idxs.to(boxes) * (largest_coord + torch.tensor(1).to(boxes))
    shifted_boxes = boxes + class_offsets[:, None]
    return nms(shifted_boxes, scores, iou_threshold)


@torch.jit._script_if_tracing
def _batched_nms_vanilla(
    boxes: Tensor,
    scores: Tensor,
    idxs: Tensor,
    iou_threshold: float,
) -> Tensor:
    # Based on the Detectron2 implementation: run nms() on each class
    # independently and merge the surviving indices.
    keep_mask = torch.zeros_like(scores, dtype=torch.bool)
    for cls in torch.unique(idxs):
        members = torch.where(idxs == cls)[0]
        survivors = nms(boxes[members], scores[members], iou_threshold)
        keep_mask[members[survivors]] = True
    kept = torch.where(keep_mask)[0]
    # Return the kept indices ordered by decreasing score, matching nms().
    order = scores[kept].sort(descending=True)[1]
    return kept[order]


def remove_small_boxes(boxes: Tensor, min_size: float) -> Tensor:
    """
    Remove every box from ``boxes`` which contains at least one side length
    that is smaller than ``min_size``.

    .. note::
        For sanitizing a :class:`~torchvision.tv_tensors.BoundingBoxes` object, consider using
        the transform :func:`~torchvision.transforms.v2.SanitizeBoundingBoxes` instead.

    Args:
        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        min_size (float): minimum size

    Returns:
        Tensor[K]: indices of the boxes that have both sides
        larger than ``min_size``
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(remove_small_boxes)
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    # A box survives only when both of its side lengths reach min_size.
    large_enough = (widths >= min_size) & (heights >= min_size)
    return torch.where(large_enough)[0]


def clip_boxes_to_image(boxes: Tensor, size: Tuple[int, int]) -> Tensor:
    """
    Clip boxes so that they lie inside an image of size ``size``.

    .. note::
        For clipping a :class:`~torchvision.tv_tensors.BoundingBoxes` object, consider using
        the transform :func:`~torchvision.transforms.v2.ClampBoundingBoxes` instead.

    Args:
        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        size (Tuple[height, width]): size of the image

    Returns:
        Tensor[N, 4]: clipped boxes
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(clip_boxes_to_image)
    height, width = size
    ndim = boxes.dim()
    # Even-numbered coordinates are x values, odd-numbered are y values.
    xs = boxes[..., 0::2]
    ys = boxes[..., 1::2]

    if torchvision._is_tracing():
        # While tracing, express the bounds as tensors via min/max so they are
        # captured in the exported graph (clamp with python scalars is not).
        zero = torch.tensor(0, dtype=boxes.dtype, device=boxes.device)
        xs = torch.max(xs, zero)
        xs = torch.min(xs, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
        ys = torch.max(ys, zero)
        ys = torch.min(ys, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
    else:
        xs = xs.clamp(min=0, max=width)
        ys = ys.clamp(min=0, max=height)

    # Re-interleave the clipped x/y coordinates back into (x1, y1, x2, y2).
    clipped = torch.stack((xs, ys), dim=ndim)
    return clipped.reshape(boxes.shape)


def box_convert(boxes: Tensor, in_fmt: str, out_fmt: str) -> Tensor:
    """
    Converts :class:`torch.Tensor` boxes from a given ``in_fmt`` to ``out_fmt``.

    .. note::
        For converting a :class:`torch.Tensor` or a :class:`~torchvision.tv_tensors.BoundingBoxes` object
        between different formats,
        consider using :func:`~torchvision.transforms.v2.functional.convert_bounding_box_format` instead.
        Or see the corresponding transform :func:`~torchvision.transforms.v2.ConvertBoundingBoxFormat`.

    Supported ``in_fmt`` and ``out_fmt`` strings are:

    ``'xyxy'``: boxes are represented via corners, x1, y1 being top left and x2, y2 being bottom right.
    This is the format that torchvision utilities expect.

    ``'xywh'``: boxes are represented via corner, width and height, x1, y2 being top left, w, h being width and height.

    ``'cxcywh'``: boxes are represented via centre, width and height, cx, cy being center of box, w, h
    being width and height.

    Args:
        boxes (Tensor[N, 4]): boxes which will be converted.
        in_fmt (str): Input format of given boxes. Supported formats are ['xyxy', 'xywh', 'cxcywh'].
        out_fmt (str): Output format of given boxes. Supported formats are ['xyxy', 'xywh', 'cxcywh']

    Returns:
        Tensor[N, 4]: Boxes into converted format.
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(box_convert)
    allowed_fmts = ("xyxy", "xywh", "cxcywh")
    if in_fmt not in allowed_fmts or out_fmt not in allowed_fmts:
        raise ValueError("Unsupported Bounding Box Conversions for given in_fmt and out_fmt")

    # Identity conversion: return a copy so callers can mutate the result.
    if in_fmt == out_fmt:
        return boxes.clone()

    # Two-stage pipeline: normalize the input to xyxy first, then convert
    # from xyxy to the requested output format. Each stage is a no-op when
    # the corresponding format is already xyxy.
    if in_fmt == "xywh":
        boxes = _box_xywh_to_xyxy(boxes)
    elif in_fmt == "cxcywh":
        boxes = _box_cxcywh_to_xyxy(boxes)

    if out_fmt == "xywh":
        boxes = _box_xyxy_to_xywh(boxes)
    elif out_fmt == "cxcywh":
        boxes = _box_xyxy_to_cxcywh(boxes)
    return boxes


def box_area(boxes: Tensor) -> Tensor:
    """
    Computes the area of a set of bounding boxes, which are specified by their
    (x1, y1, x2, y2) coordinates.

    Args:
        boxes (Tensor[N, 4]): boxes for which the area will be computed. They
            are expected to be in (x1, y1, x2, y2) format with
            ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Returns:
        Tensor[N]: the area for each box
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(box_area)
    # Upcast first to avoid overflow in the multiplication for narrow dtypes.
    boxes = _upcast(boxes)
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    return widths * heights


# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
# with slight modifications
def _box_inter_union(boxes1: Tensor, boxes2: Tensor) -> Tuple[Tensor, Tensor]:
    # Pairwise intersection and union areas between two sets of xyxy boxes.
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    # Corners of the pairwise intersection rectangles (broadcast N against M).
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    # Clamp at zero so disjoint pairs contribute no intersection area.
    inter_wh = _upcast(bottom_right - top_left).clamp(min=0)  # [N,M,2]
    inter = inter_wh[:, :, 0] * inter_wh[:, :, 1]  # [N,M]
    union = area1[:, None] + area2 - inter
    return inter, union


def box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
    """
    Return intersection-over-union (Jaccard index) between two sets of boxes.

    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Args:
        boxes1 (Tensor[N, 4]): first set of boxes
        boxes2 (Tensor[M, 4]): second set of boxes

    Returns:
        Tensor[N, M]: the NxM matrix containing the pairwise IoU values for every element in boxes1 and boxes2
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(box_iou)
    inter, union = _box_inter_union(boxes1, boxes2)
    return inter / union


# Implementation adapted from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
    """
    Return generalized intersection-over-union (Jaccard index) between two sets of boxes.

    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Args:
        boxes1 (Tensor[N, 4]): first set of boxes
        boxes2 (Tensor[M, 4]): second set of boxes

    Returns:
        Tensor[N, M]: the NxM matrix containing the pairwise generalized IoU values
        for every element in boxes1 and boxes2
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(generalized_box_iou)

    inter, union = _box_inter_union(boxes1, boxes2)
    iou = inter / union

    # Corners of the smallest axis-aligned box enclosing each pair.
    enclose_tl = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    enclose_br = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
    enclose_wh = _upcast(enclose_br - enclose_tl).clamp(min=0)  # [N,M,2]
    enclose_area = enclose_wh[:, :, 0] * enclose_wh[:, :, 1]

    # GIoU penalizes IoU by the fraction of the enclosing box not covered
    # by the union.
    return iou - (enclose_area - union) / enclose_area


def complete_box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tensor:
    """
    Return complete intersection-over-union (Jaccard index) between two sets of boxes.
    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
    Args:
        boxes1 (Tensor[N, 4]): first set of boxes
        boxes2 (Tensor[M, 4]): second set of boxes
        eps (float, optional): small number to prevent division by zero. Default: 1e-7
    Returns:
        Tensor[N, M]: the NxM matrix containing the pairwise complete IoU values
        for every element in boxes1 and boxes2
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(complete_box_iou)

    boxes1 = _upcast(boxes1)
    boxes2 = _upcast(boxes2)

    diou, iou = _box_diou_iou(boxes1, boxes2, eps)

    # Aspect-ratio consistency term: compares the two boxes' w/h ratios
    # through arctan, normalized to [0, 1] by 4/pi^2.
    widths1 = boxes1[:, None, 2] - boxes1[:, None, 0]
    heights1 = boxes1[:, None, 3] - boxes1[:, None, 1]
    widths2 = boxes2[:, 2] - boxes2[:, 0]
    heights2 = boxes2[:, 3] - boxes2[:, 1]
    angle_diff = torch.atan(widths1 / heights1) - torch.atan(widths2 / heights2)
    v = (4 / (torch.pi**2)) * torch.pow(angle_diff, 2)

    # The trade-off weight alpha is treated as a constant (no gradient flows
    # through it), matching the CIoU formulation.
    with torch.no_grad():
        alpha = v / (1 - iou + v + eps)
    return diou - alpha * v


def distance_box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tensor:
    """
    Return distance intersection-over-union (Jaccard index) between two sets of boxes.

    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Args:
        boxes1 (Tensor[N, 4]): first set of boxes
        boxes2 (Tensor[M, 4]): second set of boxes
        eps (float, optional): small number to prevent division by zero. Default: 1e-7

    Returns:
        Tensor[N, M]: the NxM matrix containing the pairwise distance IoU values
        for every element in boxes1 and boxes2
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(distance_box_iou)

    boxes1 = _upcast(boxes1)
    boxes2 = _upcast(boxes2)
    # Only the DIoU matrix is needed here; the plain IoU is discarded.
    return _box_diou_iou(boxes1, boxes2, eps=eps)[0]


def _box_diou_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tuple[Tensor, Tensor]:
    # Helper returning (DIoU, IoU) so complete_box_iou can reuse the plain
    # IoU matrix without recomputing it.
    iou = box_iou(boxes1, boxes2)

    # Squared diagonal length of the smallest box enclosing each pair
    # (eps keeps the later division well-defined).
    enclose_tl = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    enclose_br = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
    enclose_wh = _upcast(enclose_br - enclose_tl).clamp(min=0)  # [N,M,2]
    diagonal_distance_squared = (enclose_wh[:, :, 0] ** 2) + (enclose_wh[:, :, 1] ** 2) + eps

    # Centers of both sets of boxes.
    cx1 = (boxes1[:, 0] + boxes1[:, 2]) / 2
    cy1 = (boxes1[:, 1] + boxes1[:, 3]) / 2
    cx2 = (boxes2[:, 0] + boxes2[:, 2]) / 2
    cy2 = (boxes2[:, 1] + boxes2[:, 3]) / 2
    # Squared distance between the boxes' centers, pairwise.
    centers_distance_squared = (_upcast(cx1[:, None] - cx2[None, :]) ** 2) + (
        _upcast(cy1[:, None] - cy2[None, :]) ** 2
    )

    # DIoU is the IoU penalized by the normalized center distance squared.
    return iou - (centers_distance_squared / diagonal_distance_squared), iou


def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
    """
    Compute the bounding boxes around the provided masks.

    Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Args:
        masks (Tensor[N, H, W]): masks to transform where N is the number of masks
            and (H, W) are the spatial dimensions.

    Returns:
        Tensor[N, 4]: bounding boxes
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(masks_to_boxes)
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device, dtype=torch.float)

    num_masks = masks.shape[0]
    bounding_boxes = torch.zeros((num_masks, 4), device=masks.device, dtype=torch.float)

    for i, mask in enumerate(masks):
        # Coordinates of every non-zero pixel of this mask.
        ys, xs = torch.where(mask != 0)
        bounding_boxes[i, 0] = torch.min(xs)
        bounding_boxes[i, 1] = torch.min(ys)
        bounding_boxes[i, 2] = torch.max(xs)
        bounding_boxes[i, 3] = torch.max(ys)

    return bounding_boxes