# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List, Tuple

import numpy as np
import torch
from torch import Tensor
from torch import nn as nn
from torch.nn import functional as F

from mmdet3d.registry import MODELS
from mmdet3d.utils import ConfigType, OptConfigType, OptMultiConfig
from ...structures.det3d_data_sample import OptSampleList, SampleList
from ..utils import add_prefix
from .base import Base3DSegmentor


@MODELS.register_module()
class EncoderDecoder3D(Base3DSegmentor):
    """3D Encoder Decoder segmentors.

    EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
    Note that auxiliary_head is only used for deep supervision during training,
    which can be discarded during inference.

    1. The ``loss`` method is used to calculate the loss of the model,
    which includes two steps: (1) Extract features to obtain the feature
    maps; (2) Call the decode head loss function to forward the decode head
    model and calculate the losses.

    .. code:: text

    loss(): extract_feat() -> _decode_head_forward_train() -> _auxiliary_head_forward_train() (optional)
    _decode_head_forward_train(): decode_head.loss()
    _auxiliary_head_forward_train(): auxiliary_head.loss() (optional)

    2. The ``predict`` method is used to predict segmentation results,
    which includes two steps: (1) Run the inference function to obtain a
    list of seg_logits; (2) Call the post-processing function to obtain a
    list of ``Det3DDataSample`` objects including ``pred_pts_seg``.

    .. code:: text

    predict(): inference() -> postprocess_result()
    inference(): whole_inference()/slide_inference()
    whole_inference()/slide_inference(): encode_decode()
    encode_decode(): extract_feat() -> decode_head.predict()

    3. The ``_forward`` method is used to output the tensor by running the
    model, which includes two steps: (1) Extract features to obtain the
    feature maps; (2) Call the decode head forward function to forward the
    decode head model.

    .. code:: text

    _forward(): extract_feat() -> decode_head.forward()

    Args:
        backbone (dict or :obj:`ConfigDict`): The config for the backbone of
            segmentor.
        decode_head (dict or :obj:`ConfigDict`): The config for the decode
            head of segmentor.
        neck (dict or :obj:`ConfigDict`, optional): The config for the neck of
            segmentor. Defaults to None.
        auxiliary_head (dict or :obj:`ConfigDict` or List[dict or
            :obj:`ConfigDict`], optional): The config for the auxiliary head of
            segmentor. Defaults to None.
        loss_regularization (dict or :obj:`ConfigDict` or List[dict or
            :obj:`ConfigDict`], optional): The config for the regularization
            loss. Defaults to None.
        train_cfg (dict or :obj:`ConfigDict`, optional): The config for
            training. Defaults to None.
        test_cfg (dict or :obj:`ConfigDict`, optional): The config for testing.
            Defaults to None.
        data_preprocessor (dict or :obj:`ConfigDict`, optional): The
            pre-process config of :class:`BaseDataPreprocessor`.
            Defaults to None.
        init_cfg (dict or :obj:`ConfigDict` or List[dict or :obj:`ConfigDict`],
            optional): The weight initialized config for :class:`BaseModule`.
            Defaults to None.
    """  # noqa: E501

    def __init__(self,
                 backbone: ConfigType,
                 decode_head: ConfigType,
                 neck: OptConfigType = None,
                 auxiliary_head: OptMultiConfig = None,
                 loss_regularization: OptMultiConfig = None,
                 train_cfg: OptConfigType = None,
                 test_cfg: OptConfigType = None,
                 data_preprocessor: OptConfigType = None,
                 init_cfg: OptMultiConfig = None) -> None:
        super(EncoderDecoder3D, self).__init__(
            data_preprocessor=data_preprocessor, init_cfg=init_cfg)
        self.backbone = MODELS.build(backbone)
        if neck is not None:
            self.neck = MODELS.build(neck)
        self._init_decode_head(decode_head)
        self._init_auxiliary_head(auxiliary_head)
        self._init_loss_regularization(loss_regularization)

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg

        assert self.with_decode_head, \
            '3D EncoderDecoder Segmentor should have a decode_head'

    def _init_decode_head(self, decode_head: ConfigType) -> None:
        """Initialize ``decode_head``."""
        self.decode_head = MODELS.build(decode_head)
        self.num_classes = self.decode_head.num_classes

    def _init_auxiliary_head(self,
                             auxiliary_head: OptMultiConfig = None) -> None:
        """Initialize ``auxiliary_head``."""
        if auxiliary_head is not None:
            if isinstance(auxiliary_head, list):
                self.auxiliary_head = nn.ModuleList()
                for head_cfg in auxiliary_head:
                    self.auxiliary_head.append(MODELS.build(head_cfg))
            else:
                self.auxiliary_head = MODELS.build(auxiliary_head)

    def _init_loss_regularization(self,
                                  loss_regularization: OptMultiConfig = None
                                  ) -> None:
        """Initialize ``loss_regularization``."""
        if loss_regularization is not None:
            if isinstance(loss_regularization, list):
                self.loss_regularization = nn.ModuleList()
                for loss_cfg in loss_regularization:
                    self.loss_regularization.append(MODELS.build(loss_cfg))
            else:
                self.loss_regularization = MODELS.build(loss_regularization)

    def extract_feat(self, batch_inputs: Tensor) -> dict:
        """Extract features from points."""
        x = self.backbone(batch_inputs)
        if self.with_neck:
            x = self.neck(x)
        return x

    def encode_decode(self, batch_inputs: Tensor,
                      batch_input_metas: List[dict]) -> Tensor:
        """Encode points with backbone and decode into a semantic segmentation
        map of the same size as input.

        Args:
            batch_inputs (Tensor): Input point cloud sample.
            batch_input_metas (List[dict]): Meta information of a batch of
                samples.

        Returns:
            Tensor: Segmentation logits of shape [B, num_classes, N].
        """
        x = self.extract_feat(batch_inputs)
        seg_logits = self.decode_head.predict(x, batch_input_metas,
                                              self.test_cfg)
        return seg_logits

    def _decode_head_forward_train(
            self, batch_inputs_dict: dict,
            batch_data_samples: SampleList) -> Dict[str, Tensor]:
        """Run forward function and calculate loss for decode head in training.

        Args:
            batch_inputs_dict (dict): Input sample dict.
            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
                samples. It usually includes information such as `metainfo` and
                `gt_pts_seg`.

        Returns:
            Dict[str, Tensor]: A dictionary of loss components for decode head.
        """
        losses = dict()
        loss_decode = self.decode_head.loss(batch_inputs_dict,
                                            batch_data_samples, self.train_cfg)

        losses.update(add_prefix(loss_decode, 'decode'))
        return losses

    def _auxiliary_head_forward_train(
        self,
        batch_inputs_dict: dict,
        batch_data_samples: SampleList,
    ) -> Dict[str, Tensor]:
        """Run forward function and calculate loss for auxiliary head in
        training.

        Args:
            batch_inputs_dict (dict): Input sample dict.
            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
                samples. It usually includes information such as `metainfo` and
                `gt_pts_seg`.

        Returns:
            Dict[str, Tensor]: A dictionary of loss components for auxiliary
            head.
        """
        losses = dict()
        if isinstance(self.auxiliary_head, nn.ModuleList):
            for idx, aux_head in enumerate(self.auxiliary_head):
                loss_aux = aux_head.loss(batch_inputs_dict, batch_data_samples,
                                         self.train_cfg)
                losses.update(add_prefix(loss_aux, f'aux_{idx}'))
        else:
            loss_aux = self.auxiliary_head.loss(batch_inputs_dict,
                                                batch_data_samples,
                                                self.train_cfg)
            losses.update(add_prefix(loss_aux, 'aux'))

        return losses

    def _loss_regularization_forward_train(self) -> Dict[str, Tensor]:
        """Calculate regularization loss for model weight in training."""
        losses = dict()
        if isinstance(self.loss_regularization, nn.ModuleList):
            for idx, regularize_loss in enumerate(self.loss_regularization):
                loss_regularize = dict(
                    loss_regularize=regularize_loss(self.modules()))
                losses.update(add_prefix(loss_regularize, f'regularize_{idx}'))
        else:
            loss_regularize = dict(
                loss_regularize=self.loss_regularization(self.modules()))
            losses.update(add_prefix(loss_regularize, 'regularize'))

        return losses

    def loss(self, batch_inputs_dict: dict,
             batch_data_samples: SampleList) -> Dict[str, Tensor]:
        """Calculate losses from a batch of inputs and data samples.

        Args:
            batch_inputs_dict (dict): Input sample dict which
                includes 'points' and 'imgs' keys.

                - points (List[Tensor]): Point cloud of each sample.
                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
                samples. It usually includes information such as `metainfo` and
                `gt_pts_seg`.

        Returns:
            Dict[str, Tensor]: A dictionary of loss components.
        """

        # extract features using backbone
        points = torch.stack(batch_inputs_dict['points'])
        x = self.extract_feat(points)

        losses = dict()

        loss_decode = self._decode_head_forward_train(x, batch_data_samples)
        losses.update(loss_decode)

        if self.with_auxiliary_head:
            loss_aux = self._auxiliary_head_forward_train(
                x, batch_data_samples)
            losses.update(loss_aux)

        if self.with_regularization_loss:
            loss_regularize = self._loss_regularization_forward_train()
            losses.update(loss_regularize)
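        # The returned dict carries prefixed sub-losses, e.g. (the inner
        # key names depend on the configured heads and are assumptions
        # here):
        #   {'decode.loss_sem_seg': ..., 'aux_0.loss_sem_seg': ...,
        #    'regularize.loss_regularize': ...}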

        return losses

    @staticmethod
    def _input_generation(coords,
                          patch_center: Tensor,
                          coord_max: Tensor,
                          feats: Tensor,
                          use_normalized_coord: bool = False) -> Tensor:
        """Generating model input.

        Generate input by subtracting patch center and adding additional
        features. Currently supports colors and normalized xyz as features.

        Args:
            coords (Tensor): Sampled 3D point coordinate of shape [S, 3].
            patch_center (Tensor): Center coordinate of the patch.
            coord_max (Tensor): Max coordinate of all 3D points.
            feats (Tensor): Features of sampled points of shape [S, C].
            use_normalized_coord (bool): Whether to use normalized xyz as
                additional features. Defaults to False.

        Returns:
            Tensor: The generated input data of shape [S, 3+C'].
        """
        # subtract patch center, the z dimension is not centered
        centered_coords = coords.clone()
        centered_coords[:, 0] -= patch_center[0]
        centered_coords[:, 1] -= patch_center[1]

        # normalized coordinates as extra features
        if use_normalized_coord:
            normalized_coord = coords / coord_max
            feats = torch.cat([feats, normalized_coord], dim=1)

        points = torch.cat([centered_coords, feats], dim=1)

        return points
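
    # Shape sketch for ``_input_generation`` (illustrative numbers only):
    #
    #   coords = torch.rand(2048, 3)  # [S, 3] xyz
    #   feats = torch.rand(2048, 3)   # [S, C], e.g. RGB colors
    #   pts = EncoderDecoder3D._input_generation(
    #       coords, coords.mean(0), coords.max(0)[0], feats,
    #       use_normalized_coord=True)  # -> shape [2048, 3 + 3 + 3]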

    def _sliding_patch_generation(self,
                                  points: Tensor,
                                  num_points: int,
                                  block_size: float,
                                  sample_rate: float = 0.5,
                                  use_normalized_coord: bool = False,
                                  eps: float = 1e-3) -> Tuple[Tensor, Tensor]:
        """Sampling points in a sliding window fashion.

        First sample patches to cover all the input points.
        Then sample points in each patch to batch points of a certain number.

        Args:
            points (Tensor): Input points of shape [N, 3+C].
            num_points (int): Number of points to be sampled in each patch.
            block_size (float): Size of a patch to sample.
            sample_rate (float): Stride used in sliding patch. Defaults to 0.5.
            use_normalized_coord (bool): Whether to use normalized xyz as
                additional features. Defaults to False.
            eps (float): A value added to patch boundary to guarantee points
                coverage. Defaults to 1e-3.

        Returns:
            Tuple[Tensor, Tensor]:

            - patch_points (Tensor): Points of different patches of shape
              [K, N, 3+C].
            - patch_idxs (Tensor): Index of each point in `patch_points` of
              shape [K, N].
        """
        device = points.device
        # we assume the first three dims are points' 3D coordinates
        # and the rest dims are their per-point features
        coords = points[:, :3]
        feats = points[:, 3:]

        coord_max = coords.max(0)[0]
        coord_min = coords.min(0)[0]
        stride = block_size * sample_rate
        num_grid_x = int(
            torch.ceil((coord_max[0] - coord_min[0] - block_size) /
                       stride).item() + 1)
        num_grid_y = int(
            torch.ceil((coord_max[1] - coord_min[1] - block_size) /
                       stride).item() + 1)

        patch_points, patch_idxs = [], []
        for idx_y in range(num_grid_y):
            s_y = coord_min[1] + idx_y * stride
            e_y = torch.min(s_y + block_size, coord_max[1])
            s_y = e_y - block_size
            for idx_x in range(num_grid_x):
                s_x = coord_min[0] + idx_x * stride
                e_x = torch.min(s_x + block_size, coord_max[0])
                s_x = e_x - block_size

                # extract points within this patch
                cur_min = torch.tensor([s_x, s_y, coord_min[2]]).to(device)
                cur_max = torch.tensor([e_x, e_y, coord_max[2]]).to(device)
                cur_choice = ((coords >= cur_min - eps) &
                              (coords <= cur_max + eps)).all(dim=1)

                if not cur_choice.any():  # no points in this patch
                    continue

                # sample points in this patch to multiple batches
                cur_center = cur_min + block_size / 2.0
                point_idxs = torch.nonzero(cur_choice, as_tuple=True)[0]
                num_batch = int(np.ceil(point_idxs.shape[0] / num_points))
                point_size = int(num_batch * num_points)
                replace = point_size > 2 * point_idxs.shape[0]
                num_repeat = point_size - point_idxs.shape[0]
                if replace:  # duplicate
                    point_idxs_repeat = point_idxs[torch.randint(
                        0, point_idxs.shape[0],
                        size=(num_repeat, )).to(device)]
                else:
                    point_idxs_repeat = point_idxs[torch.randperm(
                        point_idxs.shape[0])[:num_repeat]]

                choices = torch.cat([point_idxs, point_idxs_repeat], dim=0)
                choices = choices[torch.randperm(choices.shape[0])]

                # construct model input
                point_batches = self._input_generation(
                    coords[choices],
                    cur_center,
                    coord_max,
                    feats[choices],
                    use_normalized_coord=use_normalized_coord)

                patch_points.append(point_batches)
                patch_idxs.append(choices)

        patch_points = torch.cat(patch_points, dim=0)
        patch_idxs = torch.cat(patch_idxs, dim=0)

        # make sure all points are sampled at least once
        assert torch.unique(patch_idxs).shape[0] == points.shape[0], \
            'some points are not sampled in sliding inference'

        return patch_points, patch_idxs
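
    # Worked example for the patch grid (numbers are assumptions): with
    # block_size=1.5 and sample_rate=0.5 the stride is 0.75, so a scene
    # spanning 6.0 along x yields ceil((6.0 - 1.5) / 0.75) + 1 = 7 patch
    # columns; patches are padded with duplicated point indices so each
    # holds an exact multiple of ``num_points``.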

    def slide_inference(self, point: Tensor, input_meta: dict,
                        rescale: bool) -> Tensor:
        """Inference by sliding-window with overlap.

        Args:
            point (Tensor): Input points of shape [N, 3+C].
            input_meta (dict): Meta information of input sample.
            rescale (bool): Whether to transform back to the original number
                of points. Will be used for voxelization based segmentors.

        Returns:
            Tensor: The output segmentation map of shape [num_classes, N].
        """
        num_points = self.test_cfg.num_points
        block_size = self.test_cfg.block_size
        sample_rate = self.test_cfg.sample_rate
        use_normalized_coord = self.test_cfg.use_normalized_coord
        batch_size = self.test_cfg.batch_size * num_points

        # patch_points is of shape [K*N, 3+C], patch_idxs is of shape [K*N]
        patch_points, patch_idxs = self._sliding_patch_generation(
            point, num_points, block_size, sample_rate, use_normalized_coord)
        feats_dim = patch_points.shape[1]
        seg_logits = []  # save patch predictions

        for batch_idx in range(0, patch_points.shape[0], batch_size):
            batch_points = patch_points[batch_idx:batch_idx + batch_size]
            batch_points = batch_points.view(-1, num_points, feats_dim)
            # batch_seg_logit is of shape [B, num_classes, N]
            batch_seg_logit = self.encode_decode(batch_points,
                                                 [input_meta] * batch_size)
            batch_seg_logit = batch_seg_logit.transpose(1, 2).contiguous()
            seg_logits.append(batch_seg_logit.view(-1, self.num_classes))

        # aggregate per-point logits: scatter-add the sums per point, then
        # divide by the per-point sampling counts
        seg_logits = torch.cat(seg_logits, dim=0)  # [K*N, num_classes]
        expand_patch_idxs = patch_idxs.unsqueeze(1).repeat(1, self.num_classes)
        preds = point.new_zeros((point.shape[0], self.num_classes)).\
            scatter_add_(dim=0, index=expand_patch_idxs, src=seg_logits)
        count_mat = torch.bincount(patch_idxs)
        preds = preds / count_mat[:, None]
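        # e.g. a point sampled by 3 overlapping patches has the sum of its
        # 3 logit vectors in ``preds`` and count_mat == 3 there, so the
        # division above recovers its mean logit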

        # TODO: if rescale and voxelization segmentor

        return preds.transpose(0, 1)  # to [num_classes, N]

    def whole_inference(self, points: Tensor, batch_input_metas: List[dict],
                        rescale: bool) -> Tensor:
        """Inference with full scene (one forward pass without sliding)."""
        seg_logit = self.encode_decode(points, batch_input_metas)
        # TODO: if rescale and voxelization segmentor
        return seg_logit

    def inference(self, points: Tensor, batch_input_metas: List[dict],
                  rescale: bool) -> Tensor:
        """Inference with slide/whole style.

        Args:
            points (Tensor): Input points of shape [B, N, 3+C].
            batch_input_metas (List[dict]): Meta information of a batch of
                samples.
            rescale (bool): Whether to transform back to the original number
                of points. Will be used for voxelization based segmentors.

        Returns:
            Tensor: The output segmentation map.
        """
        assert self.test_cfg.mode in ['slide', 'whole']
        if self.test_cfg.mode == 'slide':
            seg_logit = torch.stack([
                self.slide_inference(point, input_meta, rescale)
                for point, input_meta in zip(points, batch_input_metas)
            ], 0)
        else:
            seg_logit = self.whole_inference(points, batch_input_metas,
                                             rescale)
        output = F.softmax(seg_logit, dim=1)
        return output
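
    # A plausible ``test_cfg`` for slide mode (concrete values are
    # assumptions; the field names match what ``slide_inference`` reads):
    #
    #   test_cfg = dict(
    #       mode='slide', num_points=4096, block_size=1.5,
    #       sample_rate=0.5, use_normalized_coord=True, batch_size=8)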

    def predict(self,
                batch_inputs_dict: dict,
                batch_data_samples: SampleList,
                rescale: bool = True) -> SampleList:
        """Simple test with single scene.

        Args:
            batch_inputs_dict (dict): Input sample dict which includes 'points'
                and 'imgs' keys.

                - points (List[Tensor]): Point cloud of each sample.
                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
                samples. It usually includes information such as `metainfo` and
                `gt_pts_seg`.
            rescale (bool): Whether to transform back to the original number
                of points. Will be used for voxelization based segmentors.
                Defaults to True.

        Returns:
            List[:obj:`Det3DDataSample`]: Segmentation results of the input
            points. Each Det3DDataSample usually contains:

            - ``pred_pts_seg`` (PointData): Prediction of 3D semantic
              segmentation.
        """
        # 3D segmentation requires per-point prediction, so it's impossible
        # to use down-sampling to get a batch of scenes with the same
        # num_points; therefore, we only support testing one scene at a time
        seg_pred_list = []
        batch_input_metas = []
        for data_sample in batch_data_samples:
            batch_input_metas.append(data_sample.metainfo)

        points = batch_inputs_dict['points']
        for point, input_meta in zip(points, batch_input_metas):
            seg_prob = self.inference(
                point.unsqueeze(0), [input_meta], rescale)[0]
            seg_map = seg_prob.argmax(0)  # [N]
            # to cpu tensor for consistency with det3d
            seg_map = seg_map.cpu()
            seg_pred_list.append(seg_map)

        return self.postprocess_result(seg_pred_list, batch_data_samples)

    def _forward(self,
                 batch_inputs_dict: dict,
                 batch_data_samples: OptSampleList = None) -> Tensor:
        """Network forward process.

        Args:
            batch_inputs_dict (dict): Input sample dict which includes 'points'
                and 'imgs' keys.

                - points (List[Tensor]): Point cloud of each sample.
                - imgs (Tensor, optional): Image tensor has shape (B, C, H, W).
            batch_data_samples (List[:obj:`Det3DDataSample`]): The det3d data
                samples. It usually includes information such as `metainfo` and
                `gt_pts_seg`.

        Returns:
            Tensor: Forward output of model without any post-processes.
        """
        points = torch.stack(batch_inputs_dict['points'])
        x = self.extract_feat(points)
        return self.decode_head.forward(x)

    def aug_test(self, batch_inputs, batch_img_metas):
        """Placeholder for augmentation test."""
        pass