# prithvi_geospatial_mae.py
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
4
5
import argparse
import datetime
import os
6
from typing import Union
7
8
9
10

import albumentations
import numpy as np
import rasterio
11
import regex as re
12
13
14
15
16
17
import torch
from einops import rearrange
from terratorch.datamodules import Sen1Floods11NonGeoDataModule

from vllm import LLM

18
19
# Make float16 the default floating-point dtype for tensors created from here
# on; the vLLM engine in PrithviMAE is also created with dtype="float16".
torch.set_default_dtype(torch.float16)

20
21
22
23
24
25
# Sentinel compared against raw raster pixels in load_example() to detect
# missing data.
NO_DATA = -9999
# Value written in place of NO_DATA pixels when images are normalized; pixels
# equal to it are treated as invalid by process_channel_group().
NO_DATA_FLOAT = 0.0001
# Lower bound used when rescaling bands for visualization.
OFFSET = 0
# Percentile of the valid pixels used as candidate upper bound when rescaling.
PERCENTILE = 99

# Settings for the Sen1Floods11 datamodule.
# NOTE(review): generate_datamodule() forwards only data_root, batch_size,
# num_workers, bands, drop_last and test_transform; the remaining keys are not
# read anywhere in the visible code.
datamodule_config = {
    # Band names; main() looks up RED/GREEN/BLUE positions in this list.
    "bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"],
    "batch_size": 16,
    "constant_scale": 0.0001,
    # NOTE(review): environment-specific absolute path.
    "data_root": "/dccstor/geofm-finetuning/datasets/sen1floods11",
    "drop_last": True,
    "no_data_replace": 0.0,
    "no_label_replace": -1,
    "num_workers": 8,
    # Transforms handed to the datamodule as its test-time pipeline.
    "test_transform": [
        albumentations.Resize(
            always_apply=False, height=448, interpolation=1, p=1, width=448
        ),
        albumentations.pytorch.ToTensorV2(
            transpose_mask=False, always_apply=True, p=1.0
        ),
    ],
}


class PrithviMAE:
    """Small wrapper that runs a Prithvi model through a vLLM ``LLM`` engine."""

    def __init__(self, model):
        """Create the vLLM engine.

        Args:
            model: value forwarded to ``LLM(model=...)``.
        """
        engine_kwargs = {
            "model": model,
            "skip_tokenizer_init": True,
            "dtype": "float16",
            "enforce_eager": True,
            "model_impl": "terratorch",
            "enable_mm_embeds": True,
        }
        self.model = LLM(**engine_kwargs)

    def run(self, input_data, location_coords):
        """Encode one multi-modal input and return the model output data.

        Args:
            input_data: pixel values; expected to expose ``dtype`` when not
                None (presumably a torch.Tensor - confirm against callers).
            location_coords: passed through unchanged as ``location_coords``.

        Returns:
            ``outputs.data`` of the first result returned by ``encode``.
        """
        is_float32 = input_data is not None and input_data.dtype == torch.float32
        if is_float32:
            # float32 inputs are cast to float16 and reduced to their first
            # element along dim 0; any other input is forwarded untouched.
            input_data = input_data.to(torch.float16)[0]

        # Bundle all inputs into the single prompt structure given to encode().
        prompt = {
            "prompt_token_ids": [1],
            "multi_modal_data": {
                "pixel_values": input_data,
                "location_coords": location_coords,
            },
        }
        results = self.model.encode(prompt, use_tqdm=False)

        return results[0].outputs.data


def generate_datamodule():
    """Instantiate the Sen1Floods11 datamodule from ``datamodule_config``.

    Only the keys listed in ``forwarded_keys`` are passed to the constructor.

    Returns:
        The constructed ``Sen1Floods11NonGeoDataModule``.
    """
    forwarded_keys = (
        "data_root",
        "batch_size",
        "num_workers",
        "bands",
        "drop_last",
        "test_transform",
    )
    constructor_kwargs = {key: datamodule_config[key] for key in forwarded_keys}

    return Sen1Floods11NonGeoDataModule(**constructor_kwargs)


def process_channel_group(orig_img, channels):
    """Select channels from an image and rescale them to [0, 1] for display.

    Args:
        orig_img: torch.Tensor representing original image (reference)
        with shape = (bands, H, W).
        channels: list of indices representing RGB channels.

    Returns:
        torch.Tensor with shape (num_channels, height, width)
        for original image
    """

    selected = orig_img[channels, ...]

    # Pixels equal to NO_DATA_FLOAT are treated as missing.
    is_valid = selected != NO_DATA_FLOAT

    # Rescale (enhancing contrast): the upper bound is the PERCENTILE-th
    # percentile of the valid pixels, but never less than 3000.
    upper = max(3000, np.percentile(selected[is_valid], PERCENTILE))
    lower = OFFSET
    rescaled = torch.clamp((selected - lower) / (upper - lower), 0, 1)

    # No data as zeros
    rescaled[~is_valid] = 0

    return rescaled


def read_geotiff(file_path: str):
    """Open *file_path* with rasterio and return its pixels, metadata and coords.

    Args:
        file_path: path to image file.

    Returns:
        Array returned by ``src.read()`` (all bands of the file)
        meta info dict (``src.meta``)
        result of ``src.lnglat()``, or None when that call raises
    """

    with rasterio.open(file_path) as dataset:
        pixels = dataset.read()
        metadata = dataset.meta
        try:
            lng_lat = dataset.lnglat()
        except Exception:
            # Coordinates are optional: fall back to None if they can't be read
            lng_lat = None

    return pixels, metadata, lng_lat


def save_geotiff(image, output_path: str, meta: dict):
    """Write each band of *image* to a GeoTIFF at *output_path*.

    Args:
        image: np.ndarray with shape (bands, height, width)
        output_path: path where to save the image
        meta: dict with meta info, expanded as keyword arguments to
            ``rasterio.open``.
    """

    band_count = image.shape[0]
    with rasterio.open(output_path, "w", **meta) as dest:
        # Bands are written with 1-based band numbers.
        for band_number in range(1, band_count + 1):
            dest.write(image[band_number - 1, :, :], band_number)


def _convert_np_uint8(float_image: torch.Tensor):
    """Scale *float_image* by 255 and return it as a uint8 numpy array.

    Values are cast with ``astype`` (no rounding or clipping is applied here).
    """
    scaled = 255.0 * float_image.numpy()

    return scaled.astype(np.uint8)


def _extract_temporal_coords(file: str):
    """Return ``[year, day_of_year]`` from a timestamp in *file*, or None.

    The timestamp is the first ``YYYYDDDTHHMMSS`` or ``YYYYMMDDTHHMMSS``
    substring. Raises ValueError if an 8-digit date is not a valid calendar
    date.
    """
    match = re.search(r"(\d{7,8}T\d{6})", file)
    if match is None:
        return None

    date_part = match.group(1).split("T")[0]
    year = int(date_part[:4])
    if len(date_part) == 7:
        # YYYYDDD: the last three digits already are the day of the year.
        return [year, int(date_part[4:])]

    # YYYYMMDD: parse together with the year. Parsing only "%m%d" would use
    # strptime's default year (1900, not a leap year), giving a day-of-year
    # that is off by one after February in leap years and rejecting Feb 29.
    parsed = datetime.datetime.strptime(date_part, "%Y%m%d")
    return [year, parsed.timetuple().tm_yday]


def load_example(
    file_paths: list[str],
    mean: Union[list[float], None] = None,
    std: Union[list[float], None] = None,
    indices: Union[list[int], None] = None,
):
    """Build an input example by loading images in *file_paths*.

    Args:
        file_paths: list of file paths .
        mean: list containing mean values for each band in the
              images in *file_paths*.
        std: list containing std values for each band in the
             images in *file_paths*.
        indices: optional list of band indices to keep from each image;
             all bands are kept when None.

    Normalization is applied only when both *mean* and *std* are given;
    pixels equal to NO_DATA are then replaced by NO_DATA_FLOAT.

    Returns:
        np.array (float32) containing created example, laid out as
            (1, C, num_frames, H, W)
        list of [year, day_of_year] for each file whose name contains a
            parsable timestamp
        list of location coords for each file where they could be read
        list of meta info for each image in *file_paths*

    Raises:
        ValueError: if *file_paths* is empty.
    """

    if not file_paths:
        raise ValueError("file_paths must contain at least one path")

    imgs = []
    metas = []
    temporal_coords = []
    location_coords = []

    for file in file_paths:
        img, meta, coords = read_geotiff(file)

        # Rescaling (don't normalize on nodata)
        img = np.moveaxis(img, 0, -1)  # channels last for rescaling
        if indices is not None:
            img = img[..., indices]
        if mean is not None and std is not None:
            img = np.where(img == NO_DATA, NO_DATA_FLOAT, (img - mean) / std)

        imgs.append(img)
        metas.append(meta)
        if coords is not None:
            location_coords.append(coords)

        # Timestamps are best-effort: a file without a parsable timestamp is
        # reported and simply contributes no temporal coordinates.
        try:
            file_temporal_coords = _extract_temporal_coords(file)
        except (TypeError, ValueError) as e:
            print(f"Could not extract timestamp for {file} ({e})")
        else:
            if file_temporal_coords is not None:
                temporal_coords.append(file_temporal_coords)

    imgs = np.stack(imgs, axis=0)  # num_frames, H, W, C
    imgs = np.moveaxis(imgs, -1, 0).astype("float32")  # C, num_frames, H, W
    imgs = np.expand_dims(imgs, axis=0)  # add batch dim

    return imgs, temporal_coords, location_coords, metas


224
225
226
227
228
229
230
231
232
def run_model(
    input_data,
    temporal_coords,
    location_coords,
    model,
    datamodule,
    img_size,
    lightning_model=None,
):
    """Run tiled inference over *input_data* and stitch the predictions.

    Args:
        input_data: 5-D array indexed as (b, c, t, H, W).
        temporal_coords: list of temporal coordinates; converted to a tensor
            here but not forwarded to ``model.run``.
        location_coords: list of location coordinates; only the first entry
            is used.
        model: object exposing ``run(x, location_coords=...)``.
        datamodule: object exposing ``test_transform`` and ``aug``.
        img_size: side length of the square tiles.
        lightning_model: unused.

    Returns:
        Tensor holding the stitched per-pixel class indices, cropped to the
        original height and width, with the batch dimension removed.
    """
    # Reflect pad if not divisible by img_size
    original_h, original_w = input_data.shape[-2:]
    pad_h = (-original_h) % img_size
    pad_w = (-original_w) % img_size
    no_pad = (0, 0)
    padded = np.pad(
        input_data,
        (no_pad, no_pad, no_pad, (0, pad_h), (0, pad_w)),
        mode="reflect",
    )

    # Build sliding window: non-overlapping img_size x img_size tiles
    batch_size = 1
    tiles = torch.tensor(padded)
    tiles = tiles.unfold(3, img_size, img_size).unfold(4, img_size, img_size)
    h1, w1 = tiles.shape[3:5]
    tiles = rearrange(
        tiles, "b c t h1 w1 h w -> (b h1 w1) c t h w", h=img_size, w=img_size
    )

    # Split into batches if number of windows > batch_size
    tile_count = tiles.shape[0]
    num_batches = tile_count // batch_size if tile_count > batch_size else 1
    tile_batches = torch.tensor_split(tiles, num_batches, dim=0)

    temporal_coords = (
        torch.tensor(temporal_coords).unsqueeze(0) if temporal_coords else None
    )
    location_coords = (
        torch.tensor(location_coords[0]).unsqueeze(0) if location_coords else None
    )

    # Run Prithvi-EO-V2-300M-TL-Sen1Floods11
    tile_predictions = []
    for tile in tile_batches:
        # Apply standardization; after squeeze() the tile must be 3-D so it
        # can be transposed to channels-last for the transform.
        channels_last = tile.squeeze().numpy().transpose(1, 2, 0)
        transformed = datamodule.test_transform(image=channels_last)
        model_input = datamodule.aug(transformed)["image"]

        with torch.no_grad():
            pred = model.run(model_input, location_coords=location_coords)
        class_map = pred.argmax(dim=1)

        # Bring the class map back to the tile size.
        class_map = torch.nn.functional.interpolate(
            class_map.unsqueeze(1).float(), size=img_size, mode="nearest"
        )

        tile_predictions.append(class_map)

    stacked = torch.concat(tile_predictions, dim=0)

    # Build images from patches
    stitched = rearrange(
        stacked,
        "(b h1 w1) c h w -> b c (h1 h) (w1 w)",
        h=img_size,
        w=img_size,
        b=1,
        c=1,
        h1=h1,
        w1=w1,
    )

    # Cut padded area back to original size, then squeeze (batch size 1)
    cropped = stitched[..., :original_h, :original_w]

    return cropped[0]


def main(
    data_file: str,
    model: str,
    output_dir: str,
    rgb_outputs: bool,
    input_indices: list[int] = None,
):
    """Run inference on *data_file* and write the result images.

    Files written to *output_dir* (``<name>`` is the base name of *data_file*
    without extension):
        pred_<name>.tiff: the prediction
        rgb_pred_<name>.tiff: RGB image blended with the prediction
        original_rgb_<name>.tiff: RGB image, only when *rgb_outputs* is set

    Args:
        data_file: path of the input image.
        model: value passed to ``PrithviMAE(model=...)``.
        output_dir: directory for the outputs; created if missing.
        rgb_outputs: also save the plain RGB image.
        input_indices: band indices forwarded to ``load_example``.
    """
    os.makedirs(output_dir, exist_ok=True)

    model_obj = PrithviMAE(model=model)
    datamodule = generate_datamodule()
    img_size = 512  # Size of Sen1Floods11

    input_data, temporal_coords, location_coords, meta_data = load_example(
        file_paths=[data_file],
        indices=input_indices,
    )

    meta_data = meta_data[0]  # only one image

    if input_data.mean() > 1:
        input_data = input_data / 10000  # Convert to range 0-1

    # Positions of RED, GREEN, BLUE within the configured bands (BGR -> RGB)
    band_names = datamodule_config["bands"]
    channels = [band_names.index(b) for b in ["RED", "GREEN", "BLUE"]]

    pred = run_model(
        input_data, temporal_coords, location_coords, model_obj, datamodule, img_size
    )

    file_stem = os.path.splitext(os.path.basename(data_file))[0]

    # Save pred
    meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
    pred_file = os.path.join(output_dir, f"pred_{file_stem}.tiff")
    save_geotiff(_convert_np_uint8(pred), pred_file, meta_data)

    # Save image + pred
    meta_data.update(count=3, dtype="uint8", compress="lzw", nodata=0)

    if input_data.mean() < 1:
        input_data = input_data * 10000  # Scale to 0-10000

    rgb_orig = process_channel_group(
        orig_img=torch.Tensor(input_data[0, :, 0, ...]),
        channels=channels,
    )
    rgb_orig = rgb_orig.to(torch.float32)

    # Zero predictions become NaN so the blend is NaN there; those pixels are
    # then filled with the unblended RGB values.
    pred[pred == 0.0] = np.nan
    img_pred = rgb_orig * 0.7 + pred * 0.3
    nan_pixels = img_pred.isnan()
    img_pred[nan_pixels] = rgb_orig[nan_pixels]

    img_pred_file = os.path.join(output_dir, f"rgb_pred_{file_stem}.tiff")
    save_geotiff(
        image=_convert_np_uint8(img_pred),
        output_path=img_pred_file,
        meta=meta_data,
    )

    # Save image rgb
    if rgb_outputs:
        rgb_file = os.path.join(output_dir, f"original_rgb_{file_stem}.tiff")
        save_geotiff(
            image=_convert_np_uint8(rgb_orig),
            output_path=rgb_file,
            meta=meta_data,
        )


if __name__ == "__main__":
    # Command-line entry point: every option maps one-to-one onto a keyword
    # argument of main(). The default -h/--help option is kept enabled so the
    # help strings below are reachable.
    parser = argparse.ArgumentParser(description="MAE run inference")

    parser.add_argument(
        "--data_file",
        type=str,
        default="./India_900498_S2Hand.tif",
        help="Path to the file.",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
        help="Name or path of the model to load.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="output",
        help="Path to the directory where to save outputs.",
    )
    parser.add_argument(
        "--input_indices",
        default=[1, 2, 3, 8, 11, 12],
        type=int,
        nargs="+",
        help="""
        0-based indices of the six Prithvi channels to be selected from the input.
        By default selects [1,2,3,8,11,12] for S2L1C data.
        """,
    )
    parser.add_argument(
        "--rgb_outputs",
        action="store_true",
        help="If present, additionally save the original RGB image "
        "(original_rgb_<name>.tiff) to the output directory.",
    )
    args = parser.parse_args()

    main(**vars(args))