presets.py 5.08 KB
Newer Older
Ponku's avatar
Ponku committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from typing import Optional, Tuple, Union

import torch
import transforms as T


class StereoMatchingEvalPreset(torch.nn.Module):
    """Evaluation-time transform pipeline for stereo matching samples.

    The pipeline is assembled once in the constructor and applied in this
    order: ``ToTensor`` -> ``ConvertImageDtype(torch.float32)`` -> optional
    ``ConvertToGrayscale`` -> optional ``Resize`` -> ``Normalize`` ->
    ``MakeValidDisparityMask`` -> ``ValidateModelInput``.

    Args:
        mean: Forwarded to ``T.Normalize`` as ``mean``.
        std: Forwarded to ``T.Normalize`` as ``std``.
        resize_size: Target size handed to ``T.Resize``; when ``None`` the
            resize stage is left out of the pipeline.
        max_disparity: Forwarded to ``T.MakeValidDisparityMask``.
        interpolation_type: Forwarded to ``T.Resize`` as
            ``interpolation_type``; only used when ``resize_size`` is given.
        use_grayscale: When true, a ``T.ConvertToGrayscale`` stage is added
            right after the dtype conversion.
    """

    def __init__(
        self,
        mean: float = 0.5,
        std: float = 0.5,
        resize_size: Optional[Tuple[int, ...]] = None,
        max_disparity: Optional[float] = None,
        interpolation_type: str = "bilinear",
        use_grayscale: bool = False,
    ) -> None:
        super().__init__()

        # Stages that are always present at the head of the pipeline.
        pipeline = [T.ToTensor(), T.ConvertImageDtype(torch.float32)]

        # Optional stages, inserted in the same position relative to the rest.
        if use_grayscale:
            pipeline += [T.ConvertToGrayscale()]
        if resize_size is not None:
            resize_stage = T.Resize(resize_size, interpolation_type=interpolation_type)
            pipeline += [resize_stage]

        # Stages that always close the pipeline.
        pipeline += [
            T.Normalize(mean=mean, std=std),
            T.MakeValidDisparityMask(max_disparity=max_disparity),
            T.ValidateModelInput(),
        ]

        self.transforms = T.Compose(pipeline)

    def forward(self, images, disparities, masks):
        """Apply the composed pipeline to the inputs and return its result as-is."""
        transformed = self.transforms(images, disparities, masks)
        return transformed


class StereoMatchingTrainPreset(torch.nn.Module):
    """Training-time augmentation pipeline for stereo matching samples.

    The pipeline is assembled once in the constructor and applied in this
    order: ``ToTensor`` -> optional ``Resize`` -> optional ``ToGPU`` ->
    ``AsymmetricColorJitter`` -> ``AsymetricGammaAdjust`` -> optional
    ``ConvertToGrayscale`` -> ``RandomSpatialShift`` ->
    ``ConvertImageDtype(torch.float32)`` -> ``RandomRescaleAndCrop`` ->
    ``RandomHorizontalFlip`` -> ``RandomOcclusion`` -> ``RandomErase`` ->
    ``Normalize`` -> ``MakeValidDisparityMask`` -> ``ValidateModelInput``.

    All constructor arguments are keyword-only.

    Args:
        resize_size: Target size handed to ``T.Resize``; when ``None`` the
            resize stage is left out of the pipeline.
        resize_interpolation_type: ``interpolation_type`` for ``T.Resize``.
        crop_size: ``crop_size`` for ``T.RandomRescaleAndCrop``.
        rescale_prob: ``rescale_prob`` for ``T.RandomRescaleAndCrop``.
        scaling_type: ``scaling_type`` for ``T.RandomRescaleAndCrop``; must be
            ``"linear"`` or ``"exponential"``.
        scale_range: ``scale_range`` for ``T.RandomRescaleAndCrop``.
        scale_interpolation_type: ``interpolation_type`` for
            ``T.RandomRescaleAndCrop``.
        use_grayscale: When true, a ``T.ConvertToGrayscale`` stage is added
            after the color augmentations.
        mean: ``mean`` for ``T.Normalize``.
        std: ``std`` for ``T.Normalize``.
        gpu_transforms: When true, a ``T.ToGPU`` stage is added after the
            optional resize, so that the following stages run on its output.
        max_disparity: Forwarded to ``T.MakeValidDisparityMask``.
        spatial_shift_prob: ``p`` for ``T.RandomSpatialShift``.
        spatial_shift_max_angle: ``max_angle`` for ``T.RandomSpatialShift``.
        spatial_shift_max_displacement: ``max_px_shift`` for
            ``T.RandomSpatialShift``.
        spatial_shift_interpolation_type: ``interpolation_type`` for
            ``T.RandomSpatialShift``.
        gamma_range: ``gamma_range`` for ``T.AsymetricGammaAdjust``.
        brightness: ``brightness`` for ``T.AsymmetricColorJitter``.
        contrast: ``contrast`` for ``T.AsymmetricColorJitter``.
        saturation: ``saturation`` for ``T.AsymmetricColorJitter``.
        hue: ``hue`` for ``T.AsymmetricColorJitter``.
        asymmetric_jitter_prob: ``p`` for both ``T.AsymmetricColorJitter`` and
            ``T.AsymetricGammaAdjust``.
        horizontal_flip_prob: Probability passed to ``T.RandomHorizontalFlip``.
        occlusion_prob: ``p`` for ``T.RandomOcclusion``.
        occlusion_px_range: ``occlusion_px_range`` for ``T.RandomOcclusion``.
        erase_prob: ``p`` for ``T.RandomErase``.
        erase_px_range: ``erase_px_range`` for ``T.RandomErase``.
        erase_num_repeats: Forwarded to ``T.RandomErase`` as ``max_erase``.

    Raises:
        ValueError: If ``scaling_type`` is not ``"linear"`` or
            ``"exponential"``.
    """

    def __init__(
        self,
        *,
        resize_size: Optional[Tuple[int, ...]],
        resize_interpolation_type: str = "bilinear",
        # RandomResizeAndCrop params
        crop_size: Tuple[int, int],
        rescale_prob: float = 1.0,
        scaling_type: str = "exponential",
        scale_range: Tuple[float, float] = (-0.2, 0.5),
        scale_interpolation_type: str = "bilinear",
        # convert to grayscale
        use_grayscale: bool = False,
        # normalization params
        mean: float = 0.5,
        std: float = 0.5,
        # processing device
        gpu_transforms: bool = False,
        # masking
        max_disparity: Optional[float] = 256,
        # SpatialShift params
        spatial_shift_prob: float = 0.5,
        spatial_shift_max_angle: float = 0.5,
        spatial_shift_max_displacement: float = 0.5,
        spatial_shift_interpolation_type: str = "bilinear",
        # AsymmetricColorJitter
        gamma_range: Tuple[float, float] = (0.8, 1.2),
        brightness: Union[float, Tuple[float, float]] = (0.8, 1.2),
        contrast: Union[float, Tuple[float, float]] = (0.8, 1.2),
        saturation: Union[float, Tuple[float, float]] = 0.0,
        hue: Union[float, Tuple[float, float]] = 0.0,
        asymmetric_jitter_prob: float = 1.0,
        # RandomHorizontalFlip
        horizontal_flip_prob: float = 0.5,
        # RandomOcclusion
        occlusion_prob: float = 0.0,
        occlusion_px_range: Tuple[int, int] = (50, 100),
        # RandomErase
        erase_prob: float = 0.0,
        erase_px_range: Tuple[int, int] = (50, 100),
        erase_num_repeats: int = 1,
    ) -> None:
        # Reject unsupported scaling modes before any module state is created.
        if scaling_type not in ("linear", "exponential"):
            raise ValueError(f"Unknown scaling type: {scaling_type}. Available types: linear, exponential")

        super().__init__()
        transforms = [T.ToTensor()]

        # when fixing size across multiple datasets, we ensure
        # that the same size is used for all datasets when cropping
        if resize_size is not None:
            transforms.append(T.Resize(resize_size, interpolation_type=resize_interpolation_type))

        if gpu_transforms:
            transforms.append(T.ToGPU())

        # color handling: jitter and gamma adjustment share one probability
        color_transforms = [
            T.AsymmetricColorJitter(
                brightness=brightness, contrast=contrast, saturation=saturation, hue=hue, p=asymmetric_jitter_prob
            ),
            T.AsymetricGammaAdjust(p=asymmetric_jitter_prob, gamma_range=gamma_range),
        ]

        # grayscale conversion, when requested, runs after the color augmentations
        if use_grayscale:
            color_transforms.append(T.ConvertToGrayscale())

        transforms.extend(color_transforms)

        transforms.extend(
            [
                T.RandomSpatialShift(
                    p=spatial_shift_prob,
                    max_angle=spatial_shift_max_angle,
                    max_px_shift=spatial_shift_max_displacement,
                    interpolation_type=spatial_shift_interpolation_type,
                ),
                T.ConvertImageDtype(torch.float32),
                T.RandomRescaleAndCrop(
                    crop_size=crop_size,
                    scale_range=scale_range,
                    rescale_prob=rescale_prob,
                    scaling_type=scaling_type,
                    interpolation_type=scale_interpolation_type,
                ),
                T.RandomHorizontalFlip(horizontal_flip_prob),
                # occlusion after flip, otherwise we're occluding the reference image
                T.RandomOcclusion(p=occlusion_prob, occlusion_px_range=occlusion_px_range),
                T.RandomErase(p=erase_prob, erase_px_range=erase_px_range, max_erase=erase_num_repeats),
                T.Normalize(mean=mean, std=std),
                T.MakeValidDisparityMask(max_disparity),
                T.ValidateModelInput(),
            ]
        )

        self.transforms = T.Compose(transforms)

    # NOTE: the parameter spelling ``disparties`` is kept as-is so that callers
    # passing it by keyword keep working.
    def forward(self, images, disparties, mask):
        """Apply the composed pipeline to the inputs and return its result as-is."""
        return self.transforms(images, disparties, mask)