import mmcv
import numpy as np
import torch

from mmdet.core import segms

__all__ = [
    'ImageTransform', 'ImageCrop', 'BboxTransform', 'PolyMaskTransform',
    'MaskTransform', 'Numpy2Tensor'
]


class ImageTransform(object):
    """Preprocess an image
    1. rescale the image to expected size
    2. normalize the image
    3. flip the image (if needed)
    4. pad the image (if needed)
    5. transpose to (c, h, w)
    """

    def __init__(self,
                 mean=(0, 0, 0),
                 std=(1, 1, 1),
                 to_rgb=True,
                 size_divisor=None):
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.to_rgb = to_rgb
        self.size_divisor = size_divisor

    def __call__(self, img, scale, flip=False):
        img, scale_factor = mmcv.imrescale(img, scale, return_scale=True)
        img_shape = img.shape
        img = mmcv.imnormalize(img, self.mean, self.std, self.to_rgb)
        if flip:
            img = mmcv.imflip(img)
        if self.size_divisor is not None:
            img = mmcv.impad_to_multiple(img, self.size_divisor)
        img = img.transpose(2, 0, 1)
        return img, img_shape, scale_factor

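# Example usage (a minimal sketch; the mean/std below are the common ImageNet
# statistics, not defaults of this class, and 'demo.jpg' is a hypothetical
# path):
#
#     img_transform = ImageTransform(
#         mean=(123.675, 116.28, 103.53),
#         std=(58.395, 57.12, 57.375),
#         to_rgb=True,
#         size_divisor=32)
#     img, img_shape, scale_factor = img_transform(
#         mmcv.imread('demo.jpg'), scale=(1333, 800), flip=False)
#     # img: float32 array of shape (3, h, w), with h and w padded to
#     # multiples of 32; img_shape is the shape before padding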


class ImageCrop(object):
    """crop image patches and resize patches into fixed size
    1. (read and) flip image (if needed) 
    2. crop image patches according to given bboxes
    3. resize patches into fixed size (default 224x224)
    4. normalize the image (if needed)
    5. transpose to (c, h, w) (if needed)
    """

    def __init__(self,
                 normalize=True,
                 transpose=True,
                 color_order='RGB',
                 color_mean=(0, 0, 0),
                 color_std=(1, 1, 1)):
        self.normalize = normalize
        self.transpose = transpose

        assert color_order in ['RGB', 'BGR']
        self.color_order = color_order
        self.color_mean = np.array(color_mean, dtype=np.float32)
        self.color_std = np.array(color_std, dtype=np.float32)

    def __call__(self,
                 img_or_path,
                 bboxes,
                 crop_size,
                 scale_ratio=1.0,
                 flip=False):
        img = mmcv.imread(img_or_path)
        if flip:
            img = mmcv.imflip(img)
        if self.color_order == 'RGB':
            # mmcv.imread returns BGR images, so convert when RGB is expected
            img = mmcv.bgr2rgb(img)
        crop_imgs = mmcv.imcrop(
            img,
            bboxes[:, :4],
            scale=scale_ratio,
            pad_fill=list(self.color_mean))
        processed_crop_imgs_list = []
        for crop_img in crop_imgs:
            crop_img = mmcv.imresize(crop_img, crop_size)
            crop_img = crop_img.astype(np.float32)
            if self.normalize:
                crop_img -= self.color_mean
                crop_img /= self.color_std
            processed_crop_imgs_list.append(crop_img)
        processed_crop_imgs = np.stack(processed_crop_imgs_list, axis=0)
        if self.transpose:
            processed_crop_imgs = processed_crop_imgs.transpose(0, 3, 1, 2)
        return processed_crop_imgs
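
# Example usage (a minimal sketch; 'demo.jpg' is a hypothetical path and the
# mean/std are the common ImageNet RGB statistics):
#
#     img_crop = ImageCrop(
#         color_order='RGB',
#         color_mean=(123.675, 116.28, 103.53),
#         color_std=(58.395, 57.12, 57.375))
#     bboxes = np.array([[50., 50., 250., 250.]], dtype=np.float32)
#     patches = img_crop('demo.jpg', bboxes, crop_size=(224, 224))
#     # patches.shape == (1, 3, 224, 224)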


class BboxTransform(object):
    """Preprocess gt bboxes
    1. rescale bboxes according to image size
    2. flip bboxes (if needed)
    3. pad the first dimension to `max_num_gts`
    """

    def __init__(self, max_num_gts=None):
        self.max_num_gts = max_num_gts

    def __call__(self, bboxes, img_shape, scale_factor, flip=False):
        gt_bboxes = bboxes * scale_factor
        if flip:
            gt_bboxes = mmcv.bbox_flip(gt_bboxes, img_shape)
        if self.max_num_gts is None:
            return gt_bboxes
        else:
            num_gts = gt_bboxes.shape[0]
            padded_bboxes = np.zeros((self.max_num_gts, 4), dtype=np.float32)
            padded_bboxes[:num_gts, :] = gt_bboxes
            return padded_bboxes
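
# Example usage (a minimal sketch with made-up boxes and scale factor):
#
#     bbox_transform = BboxTransform(max_num_gts=4)
#     bboxes = np.array([[10., 20., 100., 200.]], dtype=np.float32)
#     gt_bboxes = bbox_transform(
#         bboxes, img_shape=(800, 1333, 3), scale_factor=2.0, flip=False)
#     # gt_bboxes.shape == (4, 4); row 0 holds the scaled box,
#     # the remaining rows are zero padding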


class PolyMaskTransform(object):
    """Preprocess polygon-format masks.

    1. flip the polygons (if needed)
    2. concatenate all polygons of each mask into a flat float32 array
    """

    def __call__(self, gt_mask_polys, gt_poly_lens, img_h, img_w, flip=False):
        """
        Args:
            gt_mask_polys(list): a list of masks, each mask is a list of polys,
                each poly is a list of numbers
            gt_poly_lens(list): a list of int, indicating the size of each poly
        """
        if flip:
            gt_mask_polys = segms.flip_segms(gt_mask_polys, img_h, img_w)
        num_polys_per_mask = np.array(
            [len(mask_polys) for mask_polys in gt_mask_polys], dtype=np.int64)
        gt_poly_lens = np.array(gt_poly_lens, dtype=np.int64)
        gt_mask_polys = [
            np.concatenate(mask_polys).astype(np.float32)
            for mask_polys in gt_mask_polys
        ]
        gt_mask_polys = np.concatenate(gt_mask_polys)
        return gt_mask_polys, gt_poly_lens, num_polys_per_mask
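
# Example usage (a minimal sketch; one mask consisting of a single triangle
# polygon with 3 points, i.e. 6 coordinates):
#
#     poly_transform = PolyMaskTransform()
#     mask_polys = [[[10., 10., 50., 10., 30., 40.]]]
#     poly_lens = [6]
#     polys, poly_lens, polys_per_mask = poly_transform(
#         mask_polys, poly_lens, img_h=800, img_w=1333, flip=False)
#     # polys: flat float32 array of all 6 coordinates
#     # poly_lens: array([6]); polys_per_mask: array([1])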


class MaskTransform(object):
    """Preprocess masks
    1. resize masks to expected size and stack to a single array
    2. flip the masks (if needed)
    3. pad the masks (if needed)
    """

    def __init__(self, max_num_gts, pad_size=None):
        self.max_num_gts = max_num_gts
        self.pad_size = pad_size

    def __call__(self, masks, img_size, flip=False):
        max_long_edge = max(img_size)
        max_short_edge = min(img_size)
        masks = [
            mmcv.imrescale(
                mask, (max_long_edge, max_short_edge),
                interpolation='nearest') for mask in masks
        ]
        masks = np.stack(masks, axis=0)
        if flip:
            # masks are stacked as (n, h, w), so flip the last (width) axis
            # to match the horizontal image flip
            masks = masks[:, :, ::-1]
        if self.pad_size is None:
            pad_h = masks.shape[1]
            pad_w = masks.shape[2]
        else:
            pad_size = self.pad_size if self.pad_size > 0 else max_long_edge
            pad_h = pad_w = pad_size
        padded_masks = np.zeros(
            (self.max_num_gts, pad_h, pad_w), dtype=masks.dtype)
        padded_masks[:masks.shape[0], :masks.shape[1], :masks.shape[2]] = masks
        return padded_masks
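
# Example usage (a minimal sketch with a single all-zero binary mask):
#
#     mask_transform = MaskTransform(max_num_gts=8, pad_size=None)
#     masks = [np.zeros((600, 800), dtype=np.uint8)]
#     padded = mask_transform(masks, img_size=(1333, 800), flip=False)
#     # padded.shape == (8, 800, 1066): masks are rescaled to fit within
#     # (1333, 800) and zero-padded along the first dimension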


class Numpy2Tensor(object):
    """Convert one or more numpy arrays to torch tensors."""

    def __call__(self, *args):
        if len(args) == 1:
            return torch.from_numpy(args[0])
        else:
            return tuple(torch.from_numpy(array) for array in args)
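
# Example usage (a minimal sketch):
#
#     to_tensor = Numpy2Tensor()
#     tensor = to_tensor(np.ones((3, 4), dtype=np.float32))
#     tensors = to_tensor(np.zeros(2), np.ones(3))  # returns a tuple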