boxes.py 15.1 KB
Newer Older
Aditya Oke's avatar
Aditya Oke committed
1
from typing import Tuple
2
3

import torch
4
import torchvision
5
from torch import Tensor
6
from torchvision.extension import _assert_has_ops
7

8
from ..utils import _log_api_usage_once
9
from ._box_convert import _box_cxcywh_to_xyxy, _box_xywh_to_xyxy, _box_xyxy_to_cxcywh, _box_xyxy_to_xywh
Aditya Oke's avatar
Aditya Oke committed
10
from ._utils import _upcast
11

12

13
def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
    """
    Performs non-maximum suppression (NMS) on the boxes according
    to their intersection-over-union (IoU).

    NMS iteratively removes lower scoring boxes which have an
    IoU greater than iou_threshold with another (higher scoring)
    box.

    If multiple boxes have the exact same score and satisfy the IoU
    criterion with respect to a reference box, the selected box is
    not guaranteed to be the same between CPU and GPU. This is similar
    to the behavior of argsort in PyTorch when repeated values are present.

    Args:
        boxes (Tensor[N, 4]): boxes to perform NMS on. They
            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
            ``0 <= y1 < y2``.
        scores (Tensor[N]): scores for each one of the boxes
        iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold

    Returns:
        Tensor: int64 tensor with the indices of the elements that have been kept
        by NMS, sorted in decreasing order of scores
    """
    # API-usage logging is skipped under scripting/tracing, where the guard
    # calls themselves would not be supported.
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(nms)
    # Fail early with a clear error if the compiled torchvision ops are absent.
    _assert_has_ops()
    # The actual suppression is implemented as a compiled C++/CUDA operator.
    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
42
43


44
45
46
47
48
49
def batched_nms(
    boxes: Tensor,
    scores: Tensor,
    idxs: Tensor,
    iou_threshold: float,
) -> Tensor:
    """
    Performs non-maximum suppression in a batched fashion.

    Each index value correspond to a category, and NMS
    will not be applied between elements of different categories.

    Args:
        boxes (Tensor[N, 4]): boxes where NMS will be performed. They
            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
            ``0 <= y1 < y2``.
        scores (Tensor[N]): scores for each one of the boxes
        idxs (Tensor[N]): indices of the categories for each one of the boxes.
        iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold

    Returns:
        Tensor: int64 tensor with the indices of the elements that have been kept by NMS, sorted
        in decreasing order of scores
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(batched_nms)
    # The cut-over points below come from the benchmarks at
    # https://github.com/pytorch/vision/issues/1311#issuecomment-781329339
    size_cutoff = 4000 if boxes.device.type == "cpu" else 20000
    # Small inputs (or tracing, where the per-class loop is unfriendly) use the
    # coordinate-offset trick; large inputs use the per-class loop instead.
    if torchvision._is_tracing() or boxes.numel() <= size_cutoff:
        return _batched_nms_coordinate_trick(boxes, scores, idxs, iou_threshold)
    return _batched_nms_vanilla(boxes, scores, idxs, iou_threshold)


@torch.jit._script_if_tracing
def _batched_nms_coordinate_trick(
    boxes: Tensor,
    scores: Tensor,
    idxs: Tensor,
    iou_threshold: float,
) -> Tensor:
    # Perform NMS independently per class with a single nms() call: shift each
    # class's boxes by a class-dependent offset larger than any coordinate, so
    # boxes from different classes land in disjoint regions and never overlap.
    if boxes.numel() == 0:
        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
    largest_coordinate = boxes.max()
    per_class_offset = idxs.to(boxes) * (largest_coordinate + torch.tensor(1).to(boxes))
    shifted_boxes = boxes + per_class_offset[:, None]
    return nms(shifted_boxes, scores, iou_threshold)


@torch.jit._script_if_tracing
def _batched_nms_vanilla(
    boxes: Tensor,
    scores: Tensor,
    idxs: Tensor,
    iou_threshold: float,
) -> Tensor:
    # Detectron2-style batched NMS: run nms() separately for every class and
    # merge the surviving indices, finally ordered by descending score.
    kept = torch.zeros_like(scores, dtype=torch.bool)
    for cls in torch.unique(idxs):
        members = torch.where(idxs == cls)[0]
        survivors = nms(boxes[members], scores[members], iou_threshold)
        kept[members[survivors]] = True
    kept_indices = torch.where(kept)[0]
    order = scores[kept_indices].sort(descending=True)[1]
    return kept_indices[order]
113
114


115
def remove_small_boxes(boxes: Tensor, min_size: float) -> Tensor:
    """
    Remove boxes which contains at least one side smaller than min_size.

    Args:
        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        min_size (float): minimum size

    Returns:
        Tensor[K]: indices of the boxes that have both sides
        larger than min_size
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(remove_small_boxes)
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    # A box survives only when both of its sides reach min_size.
    large_enough = (widths >= min_size) & (heights >= min_size)
    return torch.where(large_enough)[0]


136
def clip_boxes_to_image(boxes: Tensor, size: Tuple[int, int]) -> Tensor:
    """
    Clip boxes so that they lie inside an image of size `size`.

    Args:
        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        size (Tuple[height, width]): size of the image

    Returns:
        Tensor[N, 4]: clipped boxes
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(clip_boxes_to_image)
    ndim = boxes.dim()
    xs = boxes[..., 0::2]
    ys = boxes[..., 1::2]
    height, width = size

    if torchvision._is_tracing():
        # clamp() with python scalars is not export-friendly under tracing,
        # so bound the coordinates with explicit tensor min/max instead.
        xs = torch.max(xs, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
        xs = torch.min(xs, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
        ys = torch.max(ys, torch.tensor(0, dtype=boxes.dtype, device=boxes.device))
        ys = torch.min(ys, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
    else:
        xs = xs.clamp(min=0, max=width)
        ys = ys.clamp(min=0, max=height)

    # Re-interleave the clipped x and y coordinates back into xyxy layout.
    stacked = torch.stack((xs, ys), dim=ndim)
    return stacked.reshape(boxes.shape)


168
169
170
171
172
173
def box_convert(boxes: Tensor, in_fmt: str, out_fmt: str) -> Tensor:
    """
    Converts boxes from given in_fmt to out_fmt.
    Supported in_fmt and out_fmt are:

    'xyxy': boxes are represented via corners, x1, y1 being top left and x2, y2 being bottom right.
    This is the format that torchvision utilities expect.

    'xywh' : boxes are represented via corner, width and height, x1, y2 being top left, w, h being width and height.

    'cxcywh' : boxes are represented via centre, width and height, cx, cy being center of box, w, h
    being width and height.

    Args:
        boxes (Tensor[N, 4]): boxes which will be converted.
        in_fmt (str): Input format of given boxes. Supported formats are ['xyxy', 'xywh', 'cxcywh'].
        out_fmt (str): Output format of given boxes. Supported formats are ['xyxy', 'xywh', 'cxcywh']

    Returns:
        Tensor[N, 4]: Boxes into converted format.
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(box_convert)
    allowed_fmts = ("xyxy", "xywh", "cxcywh")
    if in_fmt not in allowed_fmts or out_fmt not in allowed_fmts:
        raise ValueError("Unsupported Bounding Box Conversions for given in_fmt and out_fmt")

    if in_fmt == out_fmt:
        return boxes.clone()

    # Route every conversion through the 'xyxy' representation: first bring
    # the input to xyxy (no-op when it already is), then produce the output
    # format from xyxy (no-op when xyxy is the target).
    if in_fmt == "xywh":
        boxes = _box_xywh_to_xyxy(boxes)
    elif in_fmt == "cxcywh":
        boxes = _box_cxcywh_to_xyxy(boxes)

    if out_fmt == "xywh":
        boxes = _box_xyxy_to_xywh(boxes)
    elif out_fmt == "cxcywh":
        boxes = _box_xyxy_to_cxcywh(boxes)
    return boxes
217
218


219
def box_area(boxes: Tensor) -> Tensor:
    """
    Computes the area of a set of bounding boxes, which are specified by their
    (x1, y1, x2, y2) coordinates.

    Args:
        boxes (Tensor[N, 4]): boxes for which the area will be computed. They
            are expected to be in (x1, y1, x2, y2) format with
            ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Returns:
        Tensor[N]: the area for each box
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(box_area)
    # Upcast first so width * height cannot overflow in low-precision dtypes.
    boxes = _upcast(boxes)
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    return widths * heights


# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
# with slight modifications
240
241
242
243
244
245
246
def _box_inter_union(boxes1: Tensor, boxes2: Tensor) -> Tuple[Tensor, Tensor]:
    # Pairwise intersection and union areas between two sets of xyxy boxes.
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    # Intersection rectangle: the larger of the top-left corners paired with
    # the smaller of the bottom-right corners, broadcast to [N,M,2].
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    # Clamp at zero: non-overlapping pairs would otherwise get negative sides.
    inter_wh = _upcast(bottom_right - top_left).clamp(min=0)  # [N,M,2]
    inter = inter_wh[:, :, 0] * inter_wh[:, :, 1]  # [N,M]
    union = area1[:, None] + area2 - inter
    return inter, union


255
def box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
    """
    Return intersection-over-union (Jaccard index) between two sets of boxes.

    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Args:
        boxes1 (Tensor[N, 4]): first set of boxes
        boxes2 (Tensor[M, 4]): second set of boxes

    Returns:
        Tensor[N, M]: the NxM matrix containing the pairwise IoU values for every element in boxes1 and boxes2
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(box_iou)
    inter, union = _box_inter_union(boxes1, boxes2)
    return inter / union
Aditya Oke's avatar
Aditya Oke committed
274
275
276
277
278


# Implementation adapted from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
    """
    Return generalized intersection-over-union (Jaccard index) between two sets of boxes.

    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Args:
        boxes1 (Tensor[N, 4]): first set of boxes
        boxes2 (Tensor[M, 4]): second set of boxes

    Returns:
        Tensor[N, M]: the NxM matrix containing the pairwise generalized IoU values
        for every element in boxes1 and boxes2
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(generalized_box_iou)

    inter, union = _box_inter_union(boxes1, boxes2)
    iou = inter / union

    # Smallest axis-aligned rectangle enclosing each pair of boxes.
    enclose_tl = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    enclose_br = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
    enclose_wh = _upcast(enclose_br - enclose_tl).clamp(min=0)  # [N,M,2]
    enclose_area = enclose_wh[:, :, 0] * enclose_wh[:, :, 1]

    # GIoU = IoU minus the fraction of the enclosing box not covered by the union.
    return iou - (enclose_area - union) / enclose_area
305
306


Abhijit Deo's avatar
Abhijit Deo committed
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
def complete_box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tensor:
    """
    Return complete intersection-over-union (Jaccard index) between two sets of boxes.
    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
    Args:
        boxes1 (Tensor[N, 4]): first set of boxes
        boxes2 (Tensor[M, 4]): second set of boxes
        eps (float, optional): small number to prevent division by zero. Default: 1e-7
    Returns:
        Tensor[N, M]: the NxM matrix containing the pairwise complete IoU values
        for every element in boxes1 and boxes2
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(complete_box_iou)

    boxes1 = _upcast(boxes1)
    boxes2 = _upcast(boxes2)

    diou, iou = _box_diou_iou(boxes1, boxes2, eps)

    # Side lengths, broadcast so w1/h1 pair with every w2/h2.
    w1 = boxes1[:, None, 2] - boxes1[:, None, 0]
    h1 = boxes1[:, None, 3] - boxes1[:, None, 1]

    w2 = boxes2[:, 2] - boxes2[:, 0]
    h2 = boxes2[:, 3] - boxes2[:, 1]

    # Aspect-ratio consistency term of CIoU.
    v = (4 / (torch.pi**2)) * torch.pow(torch.atan(w1 / h1) - torch.atan(w2 / h2), 2)
    # The trade-off weight alpha is treated as a constant w.r.t. gradients.
    with torch.no_grad():
        alpha = v / (1 - iou + v + eps)
    return diou - alpha * v
Abhijit Deo's avatar
Abhijit Deo committed
338
339


Yassine Alouini's avatar
Yassine Alouini committed
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
def distance_box_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tensor:
    """
    Return distance intersection-over-union (Jaccard index) between two sets of boxes.

    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Args:
        boxes1 (Tensor[N, 4]): first set of boxes
        boxes2 (Tensor[M, 4]): second set of boxes
        eps (float, optional): small number to prevent division by zero. Default: 1e-7

    Returns:
        Tensor[N, M]: the NxM matrix containing the pairwise distance IoU values
        for every element in boxes1 and boxes2
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(distance_box_iou)

    # The shared helper also returns the plain IoU, which DIoU does not need.
    diou, _ = _box_diou_iou(_upcast(boxes1), _upcast(boxes2), eps=eps)
    return diou
Yassine Alouini's avatar
Yassine Alouini committed
363
364


Aditya Oke's avatar
Aditya Oke committed
365
366
367
def _box_diou_iou(boxes1: Tensor, boxes2: Tensor, eps: float = 1e-7) -> Tuple[Tensor, Tensor]:
    # Compute pairwise DIoU and IoU in one pass, since DIoU builds on IoU.
    iou = box_iou(boxes1, boxes2)

    # Diagonal of the smallest rectangle enclosing each box pair, squared.
    enclose_tl = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    enclose_br = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
    enclose_wh = _upcast(enclose_br - enclose_tl).clamp(min=0)  # [N,M,2]
    diagonal_distance_squared = (enclose_wh[:, :, 0] ** 2) + (enclose_wh[:, :, 1] ** 2) + eps

    # Box centers.
    cx1 = (boxes1[:, 0] + boxes1[:, 2]) / 2
    cy1 = (boxes1[:, 1] + boxes1[:, 3]) / 2
    cx2 = (boxes2[:, 0] + boxes2[:, 2]) / 2
    cy2 = (boxes2[:, 1] + boxes2[:, 3]) / 2
    # Squared distance between the centers of every box pair.
    centers_distance_squared = (_upcast((cx1[:, None] - cx2[None, :])) ** 2) + (
        _upcast((cy1[:, None] - cy2[None, :])) ** 2
    )

    # DIoU penalizes IoU by the normalized squared center distance.
    return iou - (centers_distance_squared / diagonal_distance_squared), iou
Yassine Alouini's avatar
Yassine Alouini committed
384
385


386
387
def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
    """
    Compute the bounding boxes around the provided masks.

    Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Args:
        masks (Tensor[N, H, W]): masks to transform where N is the number of masks
            and (H, W) are the spatial dimensions.

    Returns:
        Tensor[N, 4]: bounding boxes
    """
    if not torch.jit.is_scripting() and not torch.jit.is_tracing():
        _log_api_usage_once(masks_to_boxes)
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device, dtype=torch.float)

    num_masks = masks.shape[0]
    boxes_out = torch.zeros((num_masks, 4), device=masks.device, dtype=torch.float)

    # Tight box per mask from the extreme nonzero pixel coordinates.
    # NOTE(review): torch.min/torch.max raise on empty tensors, so an
    # all-zero mask would error here — callers appear to rely on every
    # mask having at least one nonzero pixel.
    for i, mask in enumerate(masks):
        ys, xs = torch.where(mask != 0)
        boxes_out[i, 0] = torch.min(xs)
        boxes_out[i, 1] = torch.min(ys)
        boxes_out[i, 2] = torch.max(xs)
        boxes_out[i, 3] = torch.max(ys)

    return boxes_out