"""
Parts of this code are from torchvision and thus licensed under

BSD 3-Clause License

Copyright (c) Soumith Chintala 2016, 
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""

from __future__ import division

import math
from typing import Sequence, TypeVar

import torch
from torch.jit.annotations import List
from torch import Tensor

from torchvision.models.detection._utils import BoxCoder


@torch.jit.script
def encode_boxes(reference_boxes: torch.Tensor,
                 proposals: torch.Tensor,
                 weights: torch.Tensor,
                 ) -> torch.Tensor:
    """
    Encode a set of proposals with respect to some reference boxes

    Args:
        reference_boxes: reference boxes (x1, y1, x2, y2, (z1, z2))
        proposals: boxes to be encoded (x1, y1, x2, y2, (z1, z2))
        weights: weights for dimensions (wx, wy, ww, wh, (wz, wd))
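
    Example:
        Minimal sketch (2D case, arbitrary values):

        >>> ref = torch.tensor([[0., 0., 10., 10.]])
        >>> props = torch.tensor([[1., 1., 9., 9.]])
        >>> w = torch.ones(4)
        >>> encode_boxes(ref, props, w)  # -> [[0., 0., log(10/8), log(10/8)]]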
    """
    # perform some unpacking to make it JIT-fusion friendly
    wx = weights[0]
    wy = weights[1]
    ww = weights[2]
    wh = weights[3]

    proposals_x1 = proposals[:, 0].unsqueeze(1)
    proposals_y1 = proposals[:, 1].unsqueeze(1)
    proposals_x2 = proposals[:, 2].unsqueeze(1)
    proposals_y2 = proposals[:, 3].unsqueeze(1)

    reference_boxes_x1 = reference_boxes[:, 0].unsqueeze(1)
    reference_boxes_y1 = reference_boxes[:, 1].unsqueeze(1)
    reference_boxes_x2 = reference_boxes[:, 2].unsqueeze(1)
    reference_boxes_y2 = reference_boxes[:, 3].unsqueeze(1)

    # implementation starts here
    ex_widths = proposals_x2 - proposals_x1
    ex_heights = proposals_y2 - proposals_y1
    ex_ctr_x = proposals_x1 + 0.5 * ex_widths
    ex_ctr_y = proposals_y1 + 0.5 * ex_heights

    gt_widths = reference_boxes_x2 - reference_boxes_x1
    gt_heights = reference_boxes_y2 - reference_boxes_y1
    gt_ctr_x = reference_boxes_x1 + 0.5 * gt_widths
    gt_ctr_y = reference_boxes_y1 + 0.5 * gt_heights

    targets_dx = wx * (gt_ctr_x - ex_ctr_x) / ex_widths
    targets_dy = wy * (gt_ctr_y - ex_ctr_y) / ex_heights
    targets_dw = ww * torch.log(gt_widths / ex_widths)
    targets_dh = wh * torch.log(gt_heights / ex_heights)

    if proposals.shape[1] == 6:
        wz = weights[4]
        wd = weights[5]

        proposals_z1 = proposals[:, 4].unsqueeze(1)
        proposals_z2 = proposals[:, 5].unsqueeze(1)
        ex_depth = proposals_z2 - proposals_z1
        ex_ctr_z = proposals_z1 + 0.5 * ex_depth

        reference_boxes_z1 = reference_boxes[:, 4].unsqueeze(1)
        reference_boxes_z2 = reference_boxes[:, 5].unsqueeze(1)
        gt_depth = reference_boxes_z2 - reference_boxes_z1
        gt_ctr_z = reference_boxes_z1 + 0.5 * gt_depth

        targets_dz = wz * (gt_ctr_z - ex_ctr_z) / ex_depth
        targets_dd = wd * torch.log(gt_depth / ex_depth)

        targets = torch.cat((targets_dx, targets_dy, targets_dw, targets_dh,
                             targets_dz, targets_dd), dim=1)
    else:
        targets = torch.cat((targets_dx, targets_dy, targets_dw, targets_dh), dim=1)
    return targets


def decode_single(rel_codes: Tensor, boxes: Tensor,
                  weights: Sequence[float],
                  bbox_xform_clip: float) -> Tensor:
    """
    From a set of original boxes and encoded relative box offsets,
    get the decoded boxes.

    Args:
        rel_codes: encoded boxes [N, dim * 2] (dx, dy, dw, dh, (dz, dd))
        boxes: reference boxes (x1, y1, x2, y2, (z1, z2))
        weights: weights for dimensions (wx, wy, ww, wh, (wz, wd))
        bbox_xform_clip: clamp value for dw, dh, dd before they are
            passed to torch.exp
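
    Example:
        Minimal sketch; zero offsets recover the reference boxes
        (the clip value is torchvision's default):

        >>> boxes = torch.tensor([[0., 0., 10., 10.]])
        >>> codes = torch.zeros(1, 4)
        >>> decode_single(codes, boxes, (1., 1., 1., 1.),
        ...               math.log(1000. / 16))  # -> [[0., 0., 10., 10.]]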
    """
    # offset is 4 in case of 2d data and 6 in case of 3d
    offset = boxes.shape[1]
    boxes = boxes.to(rel_codes.dtype)

    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    wx = weights[0]
    wy = weights[1]
    ww = weights[2]
    wh = weights[3]

    dx = rel_codes[:, 0::offset] / wx
    dy = rel_codes[:, 1::offset] / wy
    dw = rel_codes[:, 2::offset] / ww
    dh = rel_codes[:, 3::offset] / wh

    # Prevent sending too large values into torch.exp()
    dw = torch.clamp(dw, max=bbox_xform_clip)
    dh = torch.clamp(dh, max=bbox_xform_clip)

    pred_ctr_x = dx * widths[:, None] + ctr_x[:, None]
    pred_ctr_y = dy * heights[:, None] + ctr_y[:, None]
    pred_w = torch.exp(dw) * widths[:, None]
    pred_h = torch.exp(dh) * heights[:, None]

    pred_boxes1 = pred_ctr_x - torch.tensor(0.5, dtype=pred_ctr_x.dtype) * pred_w
    pred_boxes2 = pred_ctr_y - torch.tensor(0.5, dtype=pred_ctr_y.dtype) * pred_h
    pred_boxes3 = pred_ctr_x + torch.tensor(0.5, dtype=pred_ctr_x.dtype) * pred_w
    pred_boxes4 = pred_ctr_y + torch.tensor(0.5, dtype=pred_ctr_y.dtype) * pred_h

    if offset == 6:
        depths = boxes[:, 5] - boxes[:, 4]
        ctr_z = boxes[:, 4] + 0.5 * depths

        wz = weights[4]
        wd = weights[5]

        dz = rel_codes[:, 4::offset] / wz
        dd = rel_codes[:, 5::offset] / wd
        dd = torch.clamp(dd, max=bbox_xform_clip)

        pred_ctr_z = dz * depths[:, None] + ctr_z[:, None]
        pred_z = torch.exp(dd) * depths[:, None]

        pred_boxes5 = pred_ctr_z - torch.tensor(0.5, dtype=pred_ctr_z.dtype) * pred_z
        pred_boxes6 = pred_ctr_z + torch.tensor(0.5, dtype=pred_ctr_z.dtype) * pred_z
        pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4,
                                  pred_boxes5, pred_boxes6), dim=2).flatten(1)
    else:
        pred_boxes = torch.stack((pred_boxes1, pred_boxes2, pred_boxes3, pred_boxes4),
                                 dim=2).flatten(1)
    return pred_boxes


class BoxCoderND(BoxCoder):
    """
    This class encodes and decodes a set of bounding boxes into
    the representation used for training the regressors.
    Compatible with 2D and 3D boxes.
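
    Example:
        Minimal sketch (3D case); the constructor is inherited from
        torchvision's BoxCoder:

        >>> coder = BoxCoderND(weights=(1., 1., 1., 1., 1., 1.))
        >>> gt = [torch.tensor([[0., 0., 4., 4., 0., 2.]])]
        >>> anchors = [torch.tensor([[1., 1., 3., 3., 0., 2.]])]
        >>> targets = coder.encode(gt, anchors)  # list with one [1, 6] tensor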
    """
    def encode(self,
               reference_boxes: List[Tensor],
               proposals: List[Tensor],
               ) -> List[Tensor]:
        """
        Encode a set of proposals with respect to some reference boxes

        Args:
            reference_boxes: reference boxes for each image.
                (x1, y1, x2, y2, (z1, z2))
            proposals: proposals for each image
                (x1, y1, x2, y2, (z1, z2))

        Returns:
            List[Tensor]: regression targets for each image
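
        Example:
            Minimal sketch; the second image has no ground truth and
            receives all-zero targets:

            >>> coder = BoxCoderND(weights=(1., 1., 1., 1.))
            >>> gt = [torch.tensor([[0., 0., 4., 4.]]), torch.empty(0, 4)]
            >>> anchors = [torch.tensor([[0., 0., 4., 4.]])] * 2
            >>> targets = coder.encode(gt, anchors)  # [zeros(1, 4), zeros(1, 4)]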
        """
        # filter for images which have a foreground class
        filter_min_one_gt = [rb.numel() > 0 for rb in reference_boxes]
        filtered_ref_boxes = [
            rb for idx, rb in enumerate(reference_boxes) if filter_min_one_gt[idx]]
        filtered_proposals = [
            pr for idx, pr in enumerate(proposals) if filter_min_one_gt[idx]]

        if any(filter_min_one_gt):
            # only consumed below for images which actually contain ground truth
            filtered_encoded = super().encode(filtered_ref_boxes, filtered_proposals)

        # fill entries for images without ground truth
        idx_enc = 0
        encoded = []
        for img_idx, gt_present in enumerate(filter_min_one_gt):
            if gt_present:
                encoded.append(filtered_encoded[idx_enc])
                idx_enc += 1
            else:
                # fill with zeros because they do not contribute to the
                # regression loss anyway (all anchors are labeled as background)
                encoded.append(torch.zeros_like(proposals[img_idx]))
        return encoded

    def encode_single(self,
                      reference_boxes: Tensor,
                      proposals: Tensor,
                      ) -> Tensor:
        """
        Encode a set of proposals with respect to some reference boxes

        Args:
            reference_boxes: reference boxes (x1, y1, x2, y2, (z1, z2))
            proposals: boxes to be encoded (x1, y1, x2, y2, (z1, z2))
        """
        dtype, device = reference_boxes.dtype, reference_boxes.device
        weights = torch.tensor(self.weights, dtype=dtype, device=device)
        targets = encode_boxes(reference_boxes, proposals, weights)
        return targets

    def decode(self, rel_codes: Tensor, boxes: List[Tensor]) -> Tensor:
        """
        Decode boxes

        Args:
            rel_codes: relative offsets to reference boxes
                [N, dim * 2] (dx, dy, dw, dh, (dz, dd))
            boxes: list of reference boxes per image
                (x1, y1, x2, y2, (z1, z2))

        Returns:
            Tensor: decoded boxes
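
        Example:
            Minimal sketch; zero offsets decode back to the anchors:

            >>> coder = BoxCoderND(weights=(1., 1., 1., 1.))
            >>> anchors = [torch.tensor([[0., 0., 10., 10.]])]
            >>> coder.decode(torch.zeros(1, 4), anchors)  # -> [[0., 0., 10., 10.]]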
        """
        assert isinstance(boxes, (list, tuple))
        assert isinstance(rel_codes, torch.Tensor)
        boxes_per_image = [b.size(0) for b in boxes]
        concat_boxes = torch.cat(boxes, dim=0)
        spatial_dims = concat_boxes.shape[1]
        box_sum = 0
        for val in boxes_per_image:
            box_sum += val
        pred_boxes = self.decode_single(rel_codes.reshape(box_sum, -1), concat_boxes)
        return pred_boxes.reshape(box_sum, spatial_dims)

    def decode_single(self, rel_codes: torch.Tensor, boxes: torch.Tensor) -> Tensor:
        """
        Decode a single set of relative codes with respect to reference boxes
        """
        return decode_single(rel_codes, boxes, self.weights, self.bbox_xform_clip)


CoderType = TypeVar('CoderType', bound=BoxCoderND)