Commit 95a87851 authored by Varun Agrawal's avatar Varun Agrawal Committed by Francisco Massa

updated all docstrings and code references for boxes to be consistent with the scheme (x1, y1, x2, y2) (#1110)
parent b7615843
@@ -286,10 +286,10 @@ def loadRes(self, resFile):
                 s = ann['keypoints']
                 x = s[0::3]
                 y = s[1::3]
-                x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y)
-                ann['area'] = (x1 - x0) * (y1 - y0)
+                x1, x2, y1, y2 = np.min(x), np.max(x), np.min(y), np.max(y)
+                ann['area'] = (x2 - x1) * (y2 - y1)
                 ann['id'] = id + 1
-                ann['bbox'] = [x0, y0, x1 - x0, y1 - y0]
+                ann['bbox'] = [x1, y1, x2 - x1, y2 - y1]
             # print('DONE (t={:0.2f}s)'.format(time.time()- tic))
         res.dataset['annotations'] = anns
......
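The hunk above maps a flat COCO keypoints list onto corner coordinates and then to COCO's `[x, y, width, height]` bbox form. A minimal pure-Python sketch of that computation (a standalone illustration, not part of pycocotools):

```python
def bbox_from_keypoints(keypoints):
    """From a flat COCO keypoints list [x, y, v, x, y, v, ...], derive the
    tight bounding box corners (x1, y1, x2, y2), then return COCO's
    [x, y, width, height] bbox together with its area."""
    xs = keypoints[0::3]
    ys = keypoints[1::3]
    x1, x2 = min(xs), max(xs)
    y1, y2 = min(ys), max(ys)
    area = (x2 - x1) * (y2 - y1)
    bbox = [x1, y1, x2 - x1, y2 - y1]  # COCO stores [x, y, w, h], not corners
    return bbox, area
```

The renaming in the commit is purely cosmetic: `x1` now names the smaller coordinate and `x2` the larger, matching the `(x1, y1, x2, y2)` scheme used everywhere else.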
@@ -32,7 +32,7 @@ class FasterRCNN(GeneralizedRCNN):
     During training, the model expects both the input tensors, as well as targets (a list of dictionaries),
     containing:
-        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values
+        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values
           between 0 and H and 0 and W
         - labels (Int64Tensor[N]): the class label for each ground-truth box
@@ -42,7 +42,7 @@ class FasterRCNN(GeneralizedRCNN):
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
-        - boxes (FloatTensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between
+        - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values between
           0 and H and 0 and W
         - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores of each prediction
@@ -300,7 +300,7 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
     During training, the model expects both the input tensors, as well as targets (a list of dictionaries),
     containing:
-        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values
          between ``0`` and ``H`` and ``0`` and ``W``
         - labels (``Int64Tensor[N]``): the class label for each ground-truth box
@@ -310,7 +310,7 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
-        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values between
          ``0`` and ``H`` and ``0`` and ``W``
         - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores of each prediction
......
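The `(x1, y1, x2, y2)` convention in the docstrings above implies `x1 < x2` and `y1 < y2`, with all coordinates inside the image. A small pure-Python sketch of that invariant (`validate_targets` is a hypothetical helper for illustration, not a torchvision API):

```python
def validate_targets(targets, image_height, image_width):
    """Check that ground-truth boxes follow the (x1, y1, x2, y2) corner
    convention documented for the detection models: top-left corner first,
    bottom-right corner second, all values within the image bounds."""
    for target in targets:
        # one label per box, as the docstring's FloatTensor[N, 4] / Int64Tensor[N] shapes imply
        assert len(target["boxes"]) == len(target["labels"])
        for x1, y1, x2, y2 in target["boxes"]:
            assert 0 <= x1 < x2 <= image_width   # x in [0, W], left < right
            assert 0 <= y1 < y2 <= image_height  # y in [0, H], top < bottom

# a well-formed target passes silently
validate_targets([{"boxes": [(10.0, 20.0, 50.0, 80.0)], "labels": [3]}],
                 image_height=100, image_width=100)
```

A box given as `[x0, y0, x1, y1]` under the old naming carried the same values; only the documentation wording changed in this commit.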
@@ -26,7 +26,7 @@ class KeypointRCNN(FasterRCNN):
     During training, the model expects both the input tensors, as well as targets (a list of dictionaries),
     containing:
-        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values
+        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values
           between 0 and H and 0 and W
         - labels (Int64Tensor[N]): the class label for each ground-truth box
         - keypoints (FloatTensor[N, K, 3]): the K keypoints location for each of the N instances, in the
@@ -38,7 +38,7 @@ class KeypointRCNN(FasterRCNN):
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
-        - boxes (FloatTensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between
+        - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values between
           0 and H and 0 and W
         - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores of each prediction
@@ -276,7 +276,7 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
     During training, the model expects both the input tensors, as well as targets (a list of dictionaries),
     containing:
-        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values
          between ``0`` and ``H`` and ``0`` and ``W``
         - labels (``Int64Tensor[N]``): the class label for each ground-truth box
         - keypoints (``FloatTensor[N, K, 3]``): the ``K`` keypoints location for each of the ``N`` instances, in the
@@ -288,7 +288,7 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
-        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values between
          ``0`` and ``H`` and ``0`` and ``W``
         - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores of each prediction
......
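The keypoints field documented above stores each point as `[x, y, visibility]`, with visibility `0` meaning the point is not labeled. A tiny illustrative helper (hypothetical, not part of torchvision) that keeps only labeled points:

```python
def visible_keypoints(instance_keypoints):
    """Given one instance's keypoints as (x, y, visibility) triples,
    return the (x, y) locations of the labeled points (visibility > 0)."""
    return [(x, y) for x, y, v in instance_keypoints if v > 0]
```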
@@ -28,7 +28,7 @@ class MaskRCNN(FasterRCNN):
     During training, the model expects both the input tensors, as well as targets (a list of dictionaries),
     containing:
-        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x0, y0, x1, y1] format, with values
+        - boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values
           between 0 and H and 0 and W
         - labels (Int64Tensor[N]): the class label for each ground-truth box
         - masks (UInt8Tensor[N, 1, H, W]): the segmentation binary masks for each instance
@@ -39,7 +39,7 @@ class MaskRCNN(FasterRCNN):
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
     follows:
-        - boxes (FloatTensor[N, 4]): the predicted boxes in [x0, y0, x1, y1] format, with values between
+        - boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values between
           0 and H and 0 and W
         - labels (Int64Tensor[N]): the predicted labels for each image
         - scores (Tensor[N]): the scores of each prediction
@@ -275,7 +275,7 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,
     During training, the model expects both the input tensors, as well as targets (a list of dictionaries),
     containing:
-        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x0, y0, x1, y1]`` format, with values
+        - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values
          between ``0`` and ``H`` and ``0`` and ``W``
         - labels (``Int64Tensor[N]``): the class label for each ground-truth box
         - masks (``UInt8Tensor[N, 1, H, W]``): the segmentation binary masks for each instance
@@ -286,7 +286,7 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,
     During inference, the model requires only the input tensors, and returns the post-processed
     predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
     follows:
-        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x0, y0, x1, y1]`` format, with values between
+        - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values between
          ``0`` and ``H`` and ``0`` and ``W``
         - labels (``Int64Tensor[N]``): the predicted labels for each image
         - scores (``Tensor[N]``): the scores of each prediction
......
@@ -12,7 +12,8 @@ def nms(boxes, scores, iou_threshold):
     box.
     Arguments:
-        boxes (Tensor[N, 4]): boxes to perform NMS on
+        boxes (Tensor[N, 4]): boxes to perform NMS on. They
+            are expected to be in (x1, y1, x2, y2) format
         scores (Tensor[N]): scores for each one of the boxes
         iou_threshold (float): discards all overlapping
             boxes with IoU > iou_threshold
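The greedy procedure `nms` documents can be sketched in pure Python. This illustrates the semantics only, not torchvision's C++/CUDA implementation: a box is kept when its IoU with every already-kept box is at or below the threshold.

```python
def iou(a, b):
    """Intersection-over-union of two boxes in (x1, y1, x2, y2) form."""
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])  # intersection top-left
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])  # intersection bottom-right
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    return inter / (area_a + area_b - inter)

def nms(boxes, scores, iou_threshold):
    """Greedy NMS: visit boxes in decreasing score order, keeping a box
    only if it does not overlap any kept box above iou_threshold.
    Returns the kept indices, highest score first."""
    order = sorted(range(len(boxes)), key=lambda i: scores[i], reverse=True)
    keep = []
    for i in order:
        if all(iou(boxes[i], boxes[j]) <= iou_threshold for j in keep):
            keep.append(i)
    return keep
```

`batched_nms`, documented in the next hunk, applies the same procedure but only among boxes sharing the same category index in `idxs`.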
@@ -34,7 +35,8 @@ def batched_nms(boxes, scores, idxs, iou_threshold):
     will not be applied between elements of different categories.
     Arguments:
-        boxes (Tensor[N, 4]): boxes where NMS will be performed
+        boxes (Tensor[N, 4]): boxes where NMS will be performed. They
+            are expected to be in (x1, y1, x2, y2) format
         scores (Tensor[N]): scores for each one of the boxes
         idxs (Tensor[N]): indices of the categories for each
             one of the boxes.
@@ -64,7 +66,7 @@ def remove_small_boxes(boxes, min_size):
     Remove boxes which contain at least one side smaller than min_size.
     Arguments:
-        boxes (Tensor[N, 4]): boxes in [x0, y0, x1, y1] format
+        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format
         min_size (int): minimum size
     Returns:
@@ -82,7 +84,7 @@ def clip_boxes_to_image(boxes, size):
     Clip boxes so that they lie inside an image of size `size`.
     Arguments:
-        boxes (Tensor[N, 4]): boxes in [x0, y0, x1, y1] format
+        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format
         size (Tuple[height, width]): size of the image
     Returns:
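Clipping amounts to clamping each corner into `[0, W] x [0, H]`; a pure-Python sketch of these semantics (an illustration, not the torchvision implementation):

```python
def clip_boxes_to_image(boxes, size):
    """Clamp (x1, y1, x2, y2) boxes into an image of the given
    (height, width) size, matching the documented argument order."""
    height, width = size
    return [
        (min(max(x1, 0), width), min(max(y1, 0), height),
         min(max(x2, 0), width), min(max(y2, 0), height))
        for x1, y1, x2, y2 in boxes
    ]
```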
@@ -101,11 +103,11 @@ def clip_boxes_to_image(boxes, size):
 def box_area(boxes):
     """
     Computes the area of a set of bounding boxes, which are specified by their
-    (x0, y0, x1, y1) coordinates.
+    (x1, y1, x2, y2) coordinates.
     Arguments:
         boxes (Tensor[N, 4]): boxes for which the area will be computed. They
-            are expected to be in (x0, y0, x1, y1) format
+            are expected to be in (x1, y1, x2, y2) format
     Returns:
         area (Tensor[N]): area for each box
@@ -119,6 +121,8 @@ def box_iou(boxes1, boxes2):
     """
     Return intersection-over-union (Jaccard index) of boxes.
+    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
+
     Arguments:
         boxes1 (Tensor[N, 4])
         boxes2 (Tensor[M, 4])
......
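For `(x1, y1, x2, y2)` boxes, `box_area` and the pairwise `box_iou` reduce to a few arithmetic steps. A pure-Python sketch returning the N x M IoU matrix as nested lists (illustrative only; torchvision computes this vectorized on tensors):

```python
def box_area(box):
    """Area of a single (x1, y1, x2, y2) box."""
    x1, y1, x2, y2 = box
    return (x2 - x1) * (y2 - y1)

def box_iou(boxes1, boxes2):
    """Pairwise IoU: result[i][j] compares boxes1[i] with boxes2[j]."""
    result = []
    for a in boxes1:
        row = []
        for b in boxes2:
            # intersection rectangle; empty overlaps clamp to zero area
            ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
            ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
            inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
            row.append(inter / (box_area(a) + box_area(b) - inter))
        result.append(row)
    return result
```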
@@ -119,7 +119,7 @@ class MultiScaleRoIAlign(nn.Module):
         x (OrderedDict[Tensor]): feature maps for each level. They are assumed to have
             all the same number of channels, but they can have different sizes.
         boxes (List[Tensor[N, 4]]): boxes to be used to perform the pooling operation, in
-            [x0, y0, x1, y1] format and in the image reference size, not the feature map
+            (x1, y1, x2, y2) format and in the image reference size, not the feature map
             reference.
         image_shapes (List[Tuple[height, width]]): the sizes of each image before they
             have been fed to a CNN to obtain feature maps. This allows us to infer the
......
@@ -45,7 +45,7 @@ def roi_align(input, boxes, output_size, spatial_scale=1.0, sampling_ratio=-1):
     Arguments:
         input (Tensor[N, C, H, W]): input tensor
-        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in x1,y1,x2,y2
+        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
             format where the regions will be taken from. If a single Tensor is passed,
             then the first column should contain the batch index. If a list of Tensors
             is passed, then each Tensor will correspond to the boxes for an element i
......
@@ -43,7 +43,7 @@ def roi_pool(input, boxes, output_size, spatial_scale=1.0):
     Arguments:
         input (Tensor[N, C, H, W]): input tensor
-        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in x1,y1,x2,y2
+        boxes (Tensor[K, 5] or List[Tensor[L, 4]]): the box coordinates in (x1, y1, x2, y2)
             format where the regions will be taken from. If a single Tensor is passed,
             then the first column should contain the batch index. If a list of Tensors
             is passed, then each Tensor will correspond to the boxes for an element i
......
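The semantics of `roi_pool` on one region can be sketched in pure Python: the `(x1, y1, x2, y2)` region is divided into an `output_size x output_size` grid and each bin is max-pooled. This sketch uses integer coordinates and omits `spatial_scale` and batching; `roi_pool_single` is an illustrative helper, not the torchvision API.

```python
def roi_pool_single(feature, box, output_size):
    """Max-pool one (x1, y1, x2, y2) region of a 2D feature map (nested
    lists, indexed feature[y][x]) into an output_size x output_size grid."""
    x1, y1, x2, y2 = box
    out = []
    for by in range(output_size):
        row = []
        for bx in range(output_size):
            # integer bin edges along each axis
            ys = y1 + (y2 - y1) * by // output_size
            ye = y1 + (y2 - y1) * (by + 1) // output_size
            xs = x1 + (x2 - x1) * bx // output_size
            xe = x1 + (x2 - x1) * (bx + 1) // output_size
            # max over the bin; max(.., ..+1) guards against empty bins
            row.append(max(feature[y][x]
                           for y in range(ys, max(ye, ys + 1))
                           for x in range(xs, max(xe, xs + 1))))
        out.append(row)
    return out
```

`roi_align` differs from this pooling by sampling each bin at fractional locations with bilinear interpolation instead of taking a max over integer cells, which removes the quantization this sketch exhibits.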