# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import datetime
import os

import albumentations
import numpy as np
import rasterio
10
import regex as re
11
12
13
14
15
16
import torch
from einops import rearrange
from terratorch.datamodules import Sen1Floods11NonGeoDataModule

from vllm import LLM

17
18
# Use float16 as torch's default floating-point dtype for the whole script.
torch.set_default_dtype(torch.float16)

# Sentinel marking missing pixels in the raw rasters.
NO_DATA = -9999
# Value written in place of NO_DATA pixels when an image is normalized.
NO_DATA_FLOAT = 0.0001
# Lower bound and percentile used for the contrast stretch of RGB previews.
OFFSET = 0
PERCENTILE = 99

# Settings for the Sen1Floods11 datamodule. generate_datamodule() forwards
# only a subset of these keys (data_root, batch_size, num_workers, bands,
# drop_last, test_transform); the remaining ones are kept for reference.
datamodule_config = {
    "bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"],
    "batch_size": 16,
    "constant_scale": 0.0001,
    "data_root": "/dccstor/geofm-finetuning/datasets/sen1floods11",
    "drop_last": True,
    "no_data_replace": 0.0,
    "no_label_replace": -1,
    "num_workers": 8,
    "test_transform": [
        albumentations.Resize(
            always_apply=False, height=448, interpolation=1, p=1, width=448
        ),
        albumentations.pytorch.ToTensorV2(
            transpose_mask=False, always_apply=True, p=1.0
        ),
    ],
}


class PrithviMAE:
    """Wrapper that runs a Prithvi geospatial model through a vLLM engine."""

    def __init__(self, model):
        """Create the vLLM engine.

        Args:
            model: value forwarded as ``model=`` to ``vllm.LLM``.
        """
        self.model = LLM(
            model=model,
            skip_tokenizer_init=True,
            dtype="float16",
            enforce_eager=True,
            model_impl="terratorch",
        )

    def run(self, input_data, location_coords):
        """Encode one input with the model and return the raw output data.

        Args:
            input_data: tensor passed to the model as ``pixel_values``. A
                float32 tensor is cast to float16 and its first element
                along dim 0 is taken; any other input is passed unchanged.
            location_coords: value passed to the model as
                ``location_coords`` (may be None).

        Returns:
            ``outputs[0].outputs.data`` of the ``encode`` call.
        """
        # merge the inputs into one data structure
        if input_data is not None and input_data.dtype == torch.float32:
            input_data = input_data.to(torch.float16)
            input_data = input_data[0]

        mm_data = {
            "pixel_values": input_data,
            "location_coords": location_coords,
        }

        # The tokenizer is skipped, so a single placeholder token id is sent
        # alongside the multi-modal payload.
        prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
        outputs = self.model.encode(prompt, use_tqdm=False)

        return outputs[0].outputs.data


def generate_datamodule():
    """Build the Sen1Floods11 datamodule from ``datamodule_config``.

    Only the data root, batch size, worker count, band list, drop-last flag
    and test transform are forwarded to the constructor.

    Returns:
        The constructed ``Sen1Floods11NonGeoDataModule``.
    """
    datamodule = Sen1Floods11NonGeoDataModule(
        data_root=datamodule_config["data_root"],
        batch_size=datamodule_config["batch_size"],
        num_workers=datamodule_config["num_workers"],
        bands=datamodule_config["bands"],
        drop_last=datamodule_config["drop_last"],
        test_transform=datamodule_config["test_transform"],
    )

    return datamodule


def process_channel_group(orig_img, channels):
    """Select channels from an image and contrast-stretch them to [0, 1].

    Args:
        orig_img: torch.Tensor representing original image (reference)
            with shape = (bands, H, W).
        channels: list of indices representing RGB channels.

    Returns:
        torch.Tensor with shape (num_channels, height, width)
        for original image, clamped to [0, 1], with no-data pixels set to 0.
    """

    orig_img = orig_img[channels, ...]
    # Pixels equal to NO_DATA_FLOAT are treated as missing.
    valid_mask = torch.ones_like(orig_img, dtype=torch.bool)
    valid_mask[orig_img == NO_DATA_FLOAT] = False

    # Rescale (enhancing contrast): the upper bound is the PERCENTILE-th
    # percentile of the valid pixels, but never below 3000.
    max_value = max(3000, np.percentile(orig_img[valid_mask], PERCENTILE))
    min_value = OFFSET

    orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0, 1)

    # No data as zeros
    orig_img[~valid_mask] = 0

    return orig_img


def read_geotiff(file_path: str):
    """Load every band of the raster at *file_path*.

    Args:
        file_path: path to image file.

    Returns:
        A 3-tuple of:
        - the array returned by ``src.read()`` (all bands),
        - the dataset's ``meta`` dict,
        - the result of ``src.lnglat()``, or None when that call fails.
    """

    with rasterio.open(file_path) as dataset:
        pixels = dataset.read()
        profile = dataset.meta
        # Coordinates are optional: any failure here just yields None.
        try:
            lnglat = dataset.lnglat()
        except Exception:
            lnglat = None

    return pixels, profile, lnglat


def save_geotiff(image, output_path: str, meta: dict):
    """Write a multi-band image to a GeoTIFF file, one band at a time.

    Args:
        image: np.ndarray with shape (bands, height, width)
        output_path: path where to save the image
        meta: dict with meta info, expanded as keyword arguments
            to ``rasterio.open``.
    """

    with rasterio.open(output_path, "w", **meta) as dest:
        # Band numbers passed to write() start at 1, array indices at 0.
        for band_number in range(1, image.shape[0] + 1):
            dest.write(image[band_number - 1, :, :], band_number)


def _convert_np_uint8(float_image: torch.Tensor):
    image = float_image.numpy() * 255.0
    image = image.astype(dtype=np.uint8)

    return image


def load_example(
    file_paths: list[str],
    mean: list[float] | None = None,
    std: list[float] | None = None,
    indices: list[int] | None = None,
):
    """Build an input example by loading images in *file_paths*.

    Args:
        file_paths: list of file paths .
        mean: list containing mean values for each band in the
              images in *file_paths*.
        std: list containing std values for each band in the
             images in *file_paths*.
        indices: optional band indices to select from each image before
                 normalization. Normalization is applied only when both
                 *mean* and *std* are given.

    Returns:
        np.array containing created example, float32 with shape
            (1, C, num_frames, H, W)
        list of [year, day_of_year] pairs, one per file whose name
            contains a ``YYYYDDDTHHMMSS`` or ``YYYYMMDDTHHMMSS`` timestamp
        list of coordinates for each image that provided them
        list of meta info for each image in *file_paths*
    """

    imgs = []
    metas = []
    temporal_coords = []
    location_coords = []

    for file in file_paths:
        img, meta, coords = read_geotiff(file)

        # Rescaling (don't normalize on nodata)
        img = np.moveaxis(img, 0, -1)  # channels last for rescaling
        if indices is not None:
            img = img[..., indices]
        if mean is not None and std is not None:
            img = np.where(img == NO_DATA, NO_DATA_FLOAT, (img - mean) / std)

        imgs.append(img)
        metas.append(meta)
        if coords is not None:
            location_coords.append(coords)

        # Timestamp extraction is best-effort: a failure is reported and the
        # file simply contributes no temporal coordinate.
        try:
            match = re.search(r"(\d{7,8}T\d{6})", file)
            if match:
                date_part = match.group(1).split("T")[0]
                year = int(date_part[:4])
                if len(date_part) == 7:
                    # YYYYDDD: the day of year is given directly.
                    julian_day = int(date_part[4:])
                else:
                    # YYYYMMDD: parse together with the year so that leap
                    # years yield the correct day of year (parsing only
                    # "%m%d" would assume the non-leap default year 1900).
                    julian_day = (
                        datetime.datetime.strptime(date_part, "%Y%m%d")
                        .timetuple()
                        .tm_yday
                    )
                temporal_coords.append([year, julian_day])
        except Exception as e:
            print(f"Could not extract timestamp for {file} ({e})")

    imgs = np.stack(imgs, axis=0)  # num_frames, H, W, C
    imgs = np.moveaxis(imgs, -1, 0).astype("float32")  # C, num_frames, H, W
    imgs = np.expand_dims(imgs, axis=0)  # add batch dim

    return imgs, temporal_coords, location_coords, metas


222
223
224
225
226
227
228
229
230
def run_model(
    input_data,
    temporal_coords,
    location_coords,
    model,
    datamodule,
    img_size,
    lightning_model=None,
):
    """Run tiled inference over an image and stitch the predictions.

    The image is reflect-padded on the bottom/right to a multiple of
    *img_size*, cut into non-overlapping ``img_size`` x ``img_size`` windows,
    each window is transformed and sent to the model, and the per-window
    class maps are reassembled and cropped back to the original size.

    Args:
        input_data: 5-D array laid out as (batch, channels, frames, H, W).
        temporal_coords: list of temporal coordinates; converted to a tensor
            here but not passed to the model.
        location_coords: list of location coordinates; only the first entry
            is forwarded to the model.
        model: object exposing ``run(x, location_coords=...)``.
        datamodule: object exposing ``test_transform`` and ``aug``.
        img_size: side length of the square windows.
        lightning_model: unused.

    Returns:
        torch.Tensor of shape (1, original_h, original_w) holding the
        argmax class index per pixel (as float).
    """
    # Reflect pad if not divisible by img_size
    original_h, original_w = input_data.shape[-2:]
    pad_h = (img_size - (original_h % img_size)) % img_size
    pad_w = (img_size - (original_w % img_size)) % img_size
    input_data = np.pad(
        input_data, ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)), mode="reflect"
    )

    # Build sliding window
    batch_size = 1
    batch = torch.tensor(input_data)
    windows = batch.unfold(3, img_size, img_size).unfold(4, img_size, img_size)
    h1, w1 = windows.shape[3:5]
    windows = rearrange(
        windows, "b c t h1 w1 h w -> (b h1 w1) c t h w", h=img_size, w=img_size
    )

    # Split into batches if number of windows > batch_size
    num_batches = windows.shape[0] // batch_size if windows.shape[0] > batch_size else 1
    windows = torch.tensor_split(windows, num_batches, dim=0)

    if temporal_coords:
        temporal_coords = torch.tensor(temporal_coords).unsqueeze(0)
    else:
        temporal_coords = None
    if location_coords:
        location_coords = torch.tensor(location_coords[0]).unsqueeze(0)
    else:
        location_coords = None

    # Run Prithvi-EO-V2-300M-TL-Sen1Floods11
    pred_imgs = []
    for x in windows:
        # Apply standardization
        x = datamodule.test_transform(image=x.squeeze().numpy().transpose(1, 2, 0))
        x = datamodule.aug(x)["image"]

        with torch.no_grad():
            pred = model.run(x, location_coords=location_coords)
        y_hat = pred.argmax(dim=1)

        # The transform may change the spatial size; bring the class map
        # back to the window size without inventing new class values.
        y_hat = torch.nn.functional.interpolate(
            y_hat.unsqueeze(1).float(), size=img_size, mode="nearest"
        )

        pred_imgs.append(y_hat)

    pred_imgs = torch.concat(pred_imgs, dim=0)

    # Build images from patches
    pred_imgs = rearrange(
        pred_imgs,
        "(b h1 w1) c h w -> b c (h1 h) (w1 w)",
        h=img_size,
        w=img_size,
        b=1,
        c=1,
        h1=h1,
        w1=w1,
    )

    # Cut padded area back to original size
    pred_imgs = pred_imgs[..., :original_h, :original_w]

    # Squeeze (batch size 1)
    pred_imgs = pred_imgs[0]

    return pred_imgs


def main(
    data_file: str,
    model: str,
    output_dir: str,
    rgb_outputs: bool,
    input_indices: list[int] | None = None,
):
    """Run inference on one GeoTIFF and write the results to *output_dir*.

    Files written (``<name>`` is *data_file*'s base name without extension):
        - ``pred_<name>.tiff``: single-band uint8 prediction.
        - ``rgb_pred_<name>.tiff``: 3-band blend of the RGB image (weight
          0.7) and the prediction (weight 0.3); pixels predicted as 0 keep
          the plain RGB value.
        - ``original_rgb_<name>.tiff``: the RGB image alone, only when
          *rgb_outputs* is true.

    Args:
        data_file: path to the input GeoTIFF.
        model: model identifier handed to ``PrithviMAE``.
        output_dir: directory for the outputs; created if missing.
        rgb_outputs: whether to also save the original RGB image.
        input_indices: band indices to select from the input file.
    """
    os.makedirs(output_dir, exist_ok=True)

    model_obj = PrithviMAE(model=model)
    datamodule = generate_datamodule()
    img_size = 512  # Size of Sen1Floods11

    input_data, temporal_coords, location_coords, meta_data = load_example(
        file_paths=[data_file],
        indices=input_indices,
    )

    meta_data = meta_data[0]  # only one image
    name_suffix = os.path.splitext(os.path.basename(data_file))[0]

    if input_data.mean() > 1:
        input_data = input_data / 10000  # Convert to range 0-1

    channels = [
        datamodule_config["bands"].index(b) for b in ["RED", "GREEN", "BLUE"]
    ]  # BGR -> RGB

    pred = run_model(
        input_data, temporal_coords, location_coords, model_obj, datamodule, img_size
    )

    # Save pred
    meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
    pred_file = os.path.join(output_dir, f"pred_{name_suffix}.tiff")
    save_geotiff(_convert_np_uint8(pred), pred_file, meta_data)

    # Save image + pred
    meta_data.update(count=3, dtype="uint8", compress="lzw", nodata=0)

    if input_data.mean() < 1:
        input_data = input_data * 10000  # Scale to 0-10000

    rgb_orig = process_channel_group(
        orig_img=torch.Tensor(input_data[0, :, 0, ...]),
        channels=channels,
    )
    rgb_orig = rgb_orig.to(torch.float32)

    # Mark background (class 0) as NaN so the blend can be undone there:
    # wherever the blend is NaN the untouched RGB value is restored.
    pred[pred == 0.0] = np.nan
    img_pred = rgb_orig * 0.7 + pred * 0.3
    img_pred[img_pred.isnan()] = rgb_orig[img_pred.isnan()]

    img_pred_file = os.path.join(output_dir, f"rgb_pred_{name_suffix}.tiff")
    save_geotiff(
        image=_convert_np_uint8(img_pred),
        output_path=img_pred_file,
        meta=meta_data,
    )

    # Save image rgb
    if rgb_outputs:
        rgb_file = os.path.join(
            output_dir,
            f"original_rgb_{name_suffix}.tiff",
        )
        save_geotiff(
            image=_convert_np_uint8(rgb_orig),
            output_path=rgb_file,
            meta=meta_data,
        )


if __name__ == "__main__":
    # Command-line entry point: every option maps one-to-one onto a
    # keyword argument of main().
    parser = argparse.ArgumentParser("MAE run inference", add_help=False)

    parser.add_argument(
        "--data_file",
        type=str,
        default="./India_900498_S2Hand.tif",
        help="Path to the file.",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
        help="Path to a checkpoint file to load from.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="output",
        help="Path to the directory where to save outputs.",
    )
    parser.add_argument(
        "--input_indices",
        default=[1, 2, 3, 8, 11, 12],
        type=int,
        nargs="+",
        help="""
        0-based indices of the six Prithvi channels to be selected from the input.
        By default selects [1,2,3,8,11,12] for S2L1C data.
        """,
    )
    parser.add_argument(
        "--rgb_outputs",
        action="store_true",
        help="If present, output files will only contain RGB channels. "
        "Otherwise, all bands will be saved.",
    )
    args = parser.parse_args()

    main(**vars(args))