boxes.py 11.6 KB
Newer Older
Aditya Oke's avatar
Aditya Oke committed
1
from typing import Tuple
2
3

import torch
4
import torchvision
5
from torch import Tensor
6
from torchvision.extension import _assert_has_ops
7

8
9
from ._box_convert import _box_cxcywh_to_xyxy, _box_xyxy_to_cxcywh, _box_xywh_to_xyxy, _box_xyxy_to_xywh

10

11
def nms(boxes: Tensor, scores: Tensor, iou_threshold: float) -> Tensor:
    """
    Perform non-maximum suppression (NMS) on boxes according to their
    intersection-over-union (IoU).

    NMS iteratively removes lower scoring boxes whose IoU with an already
    selected (higher scoring) box exceeds ``iou_threshold``.

    If multiple boxes have the exact same score and satisfy the IoU
    criterion with respect to a reference box, the selected box is
    not guaranteed to be the same between CPU and GPU. This is similar
    to the behavior of argsort in PyTorch when repeated values are present.

    Args:
        boxes (Tensor[N, 4])): boxes to perform NMS on. They
            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
            ``0 <= y1 < y2``.
        scores (Tensor[N]): scores for each one of the boxes
        iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold

    Returns:
        Tensor: int64 tensor with the indices of the elements that have been kept
        by NMS, sorted in decreasing order of scores
    """
    # The actual kernel lives in the compiled torchvision extension;
    # fail loudly here if the extension ops are unavailable.
    _assert_has_ops()
    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
38
39


40
41
42
43
44
45
def batched_nms(
    boxes: Tensor,
    scores: Tensor,
    idxs: Tensor,
    iou_threshold: float,
) -> Tensor:
    """
    Perform non-maximum suppression in a batched fashion.

    Each index value corresponds to a category, and NMS
    will not be applied between elements of different categories.

    Args:
        boxes (Tensor[N, 4]): boxes where NMS will be performed. They
            are expected to be in ``(x1, y1, x2, y2)`` format with ``0 <= x1 < x2`` and
            ``0 <= y1 < y2``.
        scores (Tensor[N]): scores for each one of the boxes
        idxs (Tensor[N]): indices of the categories for each one of the boxes.
        iou_threshold (float): discards all overlapping boxes with IoU > iou_threshold

    Returns:
        Tensor: int64 tensor with the indices of the elements that have been kept by NMS, sorted
        in decreasing order of scores
    """
    # Benchmarks that drove the following thresholds are at
    # https://github.com/pytorch/vision/issues/1311#issuecomment-781329339
    # Ideally for GPU we'd use a higher threshold
    if boxes.numel() <= 4_000 or torchvision._is_tracing():
        # Small problem (or tracing, where data-dependent dispatch is unsafe):
        # the single-call coordinate-offset trick is cheaper.
        return _batched_nms_coordinate_trick(boxes, scores, idxs, iou_threshold)
    return _batched_nms_vanilla(boxes, scores, idxs, iou_threshold)


@torch.jit._script_if_tracing
def _batched_nms_coordinate_trick(
    boxes: Tensor,
    scores: Tensor,
    idxs: Tensor,
    iou_threshold: float,
) -> Tensor:
    # Strategy: to perform NMS independently per class, shift all boxes by an
    # offset that depends only on the class idx and is large enough that boxes
    # belonging to different classes can never overlap. A single nms() call
    # then behaves as per-class NMS.
    if boxes.numel() == 0:
        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
    max_coordinate = boxes.max()
    # idxs.to(boxes) matches dtype/device; "+1" guarantees strictly disjoint ranges.
    offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes))
    shifted_boxes = boxes + offsets[:, None]
    return nms(shifted_boxes, scores, iou_threshold)


@torch.jit._script_if_tracing
def _batched_nms_vanilla(
    boxes: Tensor,
    scores: Tensor,
    idxs: Tensor,
    iou_threshold: float,
) -> Tensor:
    # Based on the Detectron2 implementation: simply run nms() once per class
    # and merge the surviving indices.
    keep_mask = torch.zeros_like(scores, dtype=torch.bool)
    for class_id in torch.unique(idxs):
        class_member_indices = torch.where(idxs == class_id)[0]
        kept_within_class = nms(boxes[class_member_indices], scores[class_member_indices], iou_threshold)
        keep_mask[class_member_indices[kept_within_class]] = True
    keep_indices = torch.where(keep_mask)[0]
    # Re-order the merged survivors by decreasing score.
    order = scores[keep_indices].sort(descending=True)[1]
    return keep_indices[order]
108
109


110
def remove_small_boxes(boxes: Tensor, min_size: float) -> Tensor:
    """
    Remove boxes which contain at least one side smaller than min_size.

    Args:
        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        min_size (float): minimum size

    Returns:
        Tensor[K]: indices of the boxes that have both sides
        larger than min_size
    """
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    # A box survives only if both of its sides reach min_size.
    keep = (widths >= min_size) & (heights >= min_size)
    return torch.where(keep)[0]


129
def clip_boxes_to_image(boxes: Tensor, size: Tuple[int, int]) -> Tensor:
    """
    Clip boxes so that they lie inside an image of size `size`.

    Args:
        boxes (Tensor[N, 4]): boxes in ``(x1, y1, x2, y2)`` format
            with ``0 <= x1 < x2`` and ``0 <= y1 < y2``.
        size (Tuple[height, width]): size of the image

    Returns:
        Tensor[N, 4]: clipped boxes
    """
    dim = boxes.dim()
    # Even-indexed coordinates are x's, odd-indexed ones are y's.
    boxes_x = boxes[..., 0::2]
    boxes_y = boxes[..., 1::2]
    height, width = size

    if torchvision._is_tracing():
        # Under tracing, clamp against tensors instead of Python scalars so the
        # bounds are captured in the trace.
        zero = torch.tensor(0, dtype=boxes.dtype, device=boxes.device)
        boxes_x = torch.max(boxes_x, zero)
        boxes_x = torch.min(boxes_x, torch.tensor(width, dtype=boxes.dtype, device=boxes.device))
        boxes_y = torch.max(boxes_y, zero)
        boxes_y = torch.min(boxes_y, torch.tensor(height, dtype=boxes.dtype, device=boxes.device))
    else:
        boxes_x = boxes_x.clamp(min=0, max=width)
        boxes_y = boxes_y.clamp(min=0, max=height)

    # Interleave x's and y's back into (x1, y1, x2, y2) order.
    clipped_boxes = torch.stack((boxes_x, boxes_y), dim=dim)
    return clipped_boxes.reshape(boxes.shape)


159
160
161
162
163
164
def box_convert(boxes: Tensor, in_fmt: str, out_fmt: str) -> Tensor:
    """
    Converts boxes from given in_fmt to out_fmt.
    Supported in_fmt and out_fmt are:

    'xyxy': boxes are represented via corners, x1, y1 being top left and x2, y2 being bottom right.
    This is the format that torchvision utilities expect.

    'xywh' : boxes are represented via corner, width and height, x1, y2 being top left, w, h being width and height.

    'cxcywh' : boxes are represented via centre, width and height, cx, cy being center of box, w, h
    being width and height.

    Args:
        boxes (Tensor[N, 4]): boxes which will be converted.
        in_fmt (str): Input format of given boxes. Supported formats are ['xyxy', 'xywh', 'cxcywh'].
        out_fmt (str): Output format of given boxes. Supported formats are ['xyxy', 'xywh', 'cxcywh']

    Returns:
        Tensor[N, 4]: Boxes into converted format.
    """
    allowed_fmts = ("xyxy", "xywh", "cxcywh")
    if in_fmt not in allowed_fmts or out_fmt not in allowed_fmts:
        raise ValueError("Unsupported Bounding Box Conversions for given in_fmt and out_fmt")

    if in_fmt == out_fmt:
        return boxes.clone()

    # Every conversion is routed through the canonical xyxy representation:
    # first normalize the input to xyxy, then emit the requested format.
    if in_fmt == "xywh":
        boxes = _box_xywh_to_xyxy(boxes)
    elif in_fmt == "cxcywh":
        boxes = _box_cxcywh_to_xyxy(boxes)

    if out_fmt == "xywh":
        boxes = _box_xyxy_to_xywh(boxes)
    elif out_fmt == "cxcywh":
        boxes = _box_xyxy_to_cxcywh(boxes)
    return boxes
207
208


209
210
211
212
213
214
215
216
def _upcast(t: Tensor) -> Tensor:
    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
    if t.is_floating_point():
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    else:
        return t if t.dtype in (torch.int32, torch.int64) else t.int()


217
def box_area(boxes: Tensor) -> Tensor:
    """
    Computes the area of a set of bounding boxes, which are specified by their
    (x1, y1, x2, y2) coordinates.

    Args:
        boxes (Tensor[N, 4]): boxes for which the area will be computed. They
            are expected to be in (x1, y1, x2, y2) format with
            ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Returns:
        Tensor[N]: the area for each box
    """
    # Widen narrow dtypes first so the width * height product cannot overflow.
    if boxes.is_floating_point():
        if boxes.dtype not in (torch.float32, torch.float64):
            boxes = boxes.float()
    elif boxes.dtype not in (torch.int32, torch.int64):
        boxes = boxes.int()
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    return widths * heights


# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py
# with slight modifications
def _box_inter_union(boxes1: Tensor, boxes2: Tensor) -> Tuple[Tensor, Tensor]:
    # Pairwise intersection and union areas between two sets of xyxy boxes.
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    # Intersection rectangle per pair: max of top-left corners,
    # min of bottom-right corners.
    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    # Clamp at 0 so disjoint pairs contribute zero intersection.
    wh = _upcast(bottom_right - top_left).clamp(min=0)  # [N,M,2]
    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]
    union = area1[:, None] + area2 - inter
    return inter, union


251
def box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
    """
    Return intersection-over-union (Jaccard index) between two sets of boxes.

    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Args:
        boxes1 (Tensor[N, 4]): first set of boxes
        boxes2 (Tensor[M, 4]): second set of boxes

    Returns:
        Tensor[N, M]: the NxM matrix containing the pairwise IoU values for every element in boxes1 and boxes2
    """
    inter, union = _box_inter_union(boxes1, boxes2)
    return inter / union
Aditya Oke's avatar
Aditya Oke committed
268
269
270
271
272


# Implementation adapted from https://github.com/facebookresearch/detr/blob/master/util/box_ops.py
def generalized_box_iou(boxes1: Tensor, boxes2: Tensor) -> Tensor:
    """
    Return generalized intersection-over-union (Jaccard index) between two sets of boxes.

    Both sets of boxes are expected to be in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Args:
        boxes1 (Tensor[N, 4]): first set of boxes
        boxes2 (Tensor[M, 4]): second set of boxes

    Returns:
        Tensor[N, M]: the NxM matrix containing the pairwise generalized IoU values
        for every element in boxes1 and boxes2
    """
    # degenerate boxes gives inf / nan results
    # so do an early check
    # NOTE: these asserts are stripped under ``python -O``; callers are expected
    # to pass well-formed boxes.
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all(), "boxes1 must be in (x1, y1, x2, y2) format with x2 >= x1 and y2 >= y1"
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all(), "boxes2 must be in (x1, y1, x2, y2) format with x2 >= x1 and y2 >= y1"

    inter, union = _box_inter_union(boxes1, boxes2)
    iou = inter / union

    # Smallest enclosing box for each pair.
    lti = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    rbi = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    whi = _upcast(rbi - lti).clamp(min=0)  # [N,M,2]
    areai = whi[:, :, 0] * whi[:, :, 1]

    # GIoU = IoU - |enclosing \ union| / |enclosing|
    return iou - (areai - union) / areai
302
303
304
305


def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
    """
    Compute the bounding boxes around the provided masks.

    Returns a [N, 4] tensor containing bounding boxes. The boxes are in ``(x1, y1, x2, y2)`` format with
    ``0 <= x1 < x2`` and ``0 <= y1 < y2``.

    Args:
        masks (Tensor[N, H, W]): masks to transform where N is the number of masks
            and (H, W) are the spatial dimensions.

    Returns:
        Tensor[N, 4]: bounding boxes
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device, dtype=torch.float)

    n = masks.shape[0]
    bounding_boxes = torch.zeros((n, 4), device=masks.device, dtype=torch.float)

    for index, mask in enumerate(masks):
        # Use the loop variable directly instead of re-indexing masks[index].
        # NOTE: an all-zero mask yields empty x/y, on which min/max would raise;
        # callers are expected to pass non-empty masks.
        y, x = torch.where(mask != 0)

        bounding_boxes[index, 0] = torch.min(x)
        bounding_boxes[index, 1] = torch.min(y)
        bounding_boxes[index, 2] = torch.max(x)
        bounding_boxes[index, 3] = torch.max(y)

    return bounding_boxes