# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import datetime
import os
6
import re
7
from typing import Union
8
9
10
11
12
13
14
15
16
17

import albumentations
import numpy as np
import rasterio
import torch
from einops import rearrange
from terratorch.datamodules import Sen1Floods11NonGeoDataModule

from vllm import LLM

18
19
# Tensors created without an explicit dtype default to float16, the same dtype
# the vLLM engine in PrithviMAE is created with.
torch.set_default_dtype(torch.float16)

# Sentinel marking missing pixels in the raw rasters (see load_example).
NO_DATA = -9999
# Value substituted for missing pixels when the data is normalized.
NO_DATA_FLOAT = 0.0001
# Lower bound and percentile used for the contrast stretch in
# process_channel_group.
OFFSET = 0
PERCENTILE = 99

# Settings for the Sen1Floods11 datamodule. generate_datamodule() reads
# data_root, batch_size, num_workers, bands, drop_last and test_transform;
# "bands" is additionally used by main() to locate the RGB channels.
datamodule_config = {
    "bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"],
    "batch_size": 16,
    "constant_scale": 0.0001,
    "data_root": "/dccstor/geofm-finetuning/datasets/sen1floods11",
    "drop_last": True,
    "no_data_replace": 0.0,
    "no_label_replace": -1,
    "num_workers": 8,
    "test_transform": [
        albumentations.Resize(
            always_apply=False, height=448, interpolation=1, p=1, width=448
        ),
        albumentations.pytorch.ToTensorV2(
            transpose_mask=False, always_apply=True, p=1.0
        ),
    ],
}


class PrithviMAE:
    """Thin wrapper that runs a Prithvi model through the vLLM ``LLM`` engine."""

    def __init__(self, model):
        """Create the vLLM engine.

        Args:
            model: value forwarded as ``LLM(model=...)``.
        """
        self.model = LLM(
            model=model, skip_tokenizer_init=True, dtype="float16", enforce_eager=True
        )

    def run(self, input_data, location_coords):
        """Encode one input with the model and return the raw output data.

        Args:
            input_data: tensor passed to the model as ``pixel_values``.
            location_coords: value passed to the model as ``location_coords``
                (may be None).

        Returns:
            ``outputs[0].outputs.data`` of the ``encode`` call.
        """
        # float32 inputs are cast to float16 and their leading axis is dropped.
        # NOTE(review): the leading axis is only dropped on this float32 path;
        # confirm that inputs of any other dtype arrive already unbatched.
        if input_data is not None and input_data.dtype == torch.float32:
            input_data = input_data.to(torch.float16)
            input_data = input_data[0]

        # merge the inputs into one data structure
        mm_data = {
            "pixel_values": input_data,
            "location_coords": location_coords,
        }

        prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
        outputs = self.model.encode(prompt, use_tqdm=False)

        return outputs[0].outputs.data


def generate_datamodule():
    """Build the Sen1Floods11 datamodule from ``datamodule_config``.

    Returns:
        A ``Sen1Floods11NonGeoDataModule`` constructed with the data root,
        batch size, worker count, bands, drop_last flag and test transform
        taken from ``datamodule_config``.
    """
    datamodule = Sen1Floods11NonGeoDataModule(
        data_root=datamodule_config["data_root"],
        batch_size=datamodule_config["batch_size"],
        num_workers=datamodule_config["num_workers"],
        bands=datamodule_config["bands"],
        drop_last=datamodule_config["drop_last"],
        test_transform=datamodule_config["test_transform"],
    )

    return datamodule


def process_channel_group(orig_img, channels):
    """Select *channels* from *orig_img* and stretch them to the [0, 1] range.

    Args:
        orig_img: torch.Tensor representing original image (reference)
            with shape = (bands, H, W).
        channels: list of indices representing RGB channels.

    Returns:
        torch.Tensor with shape (num_channels, height, width)
        for original image, with values clamped to [0, 1] and no-data
        pixels set to 0.
    """

    # Indexing with a list copies the data, so the caller's tensor is untouched.
    orig_img = orig_img[channels, ...]
    valid_mask = orig_img != NO_DATA_FLOAT

    # Rescale (enhancing contrast). The upper bound is the PERCENTILE-th
    # percentile of the valid pixels, but never below 3000.
    max_value = 3000.0
    if valid_mask.any():
        # Skip the percentile when there are no valid pixels: it is undefined
        # on empty input. A plain float keeps the arithmetic below
        # independent of NumPy scalar types.
        max_value = max(
            max_value, float(np.percentile(orig_img[valid_mask], PERCENTILE))
        )
    min_value = OFFSET

    orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0, 1)

    # No data as zeros
    orig_img[~valid_mask] = 0

    return orig_img


def read_geotiff(file_path: str):
    """Open *file_path* with rasterio and return its pixels, metadata and coords.

    Args:
        file_path: path to image file.

    Returns:
        Tuple ``(img, meta, coords)``:
        img: result of ``read()`` on the dataset, i.e. all bands.
        meta: the dataset's ``meta`` attribute.
        coords: result of ``lnglat()`` on the dataset, or None when that
            call raises.
    """

    with rasterio.open(file_path) as dataset:
        pixels = dataset.read()
        profile = dataset.meta
        try:
            lng_lat = dataset.lnglat()
        except Exception:
            # Best effort: the coordinates are optional for callers.
            lng_lat = None

    return pixels, profile, lng_lat


def save_geotiff(image, output_path: str, meta: dict):
    """Write every band of *image* to a GeoTIFF at *output_path*.

    Args:
        image: np.ndarray with shape (bands, height, width)
        output_path: path where to save the image
        meta: dict with meta info, expanded into keyword arguments of
            ``rasterio.open``.
    """

    with rasterio.open(output_path, "w", **meta) as writer:
        # Bands are written one at a time; destination indexes start at 1.
        for band_number, band in enumerate(image, start=1):
            writer.write(band, band_number)


def _convert_np_uint8(float_image: torch.Tensor):
    """Scale *float_image* by 255 and return it as a uint8 NumPy array."""
    scaled = 255.0 * float_image.numpy()
    return scaled.astype(np.uint8)


def load_example(
    file_paths: list[str],
    mean: Union[list[float], None] = None,
    std: Union[list[float], None] = None,
    indices: Union[list[int], None] = None,
):
    """Build an input example by loading images in *file_paths*.

    Args:
        file_paths: list of file paths.
        mean: list containing mean values for each band in the
              images in *file_paths*.
        std: list containing std values for each band in the
             images in *file_paths*.
        indices: optional list of band indices to keep from each image;
             all bands are kept when None.

    Returns:
        Tuple ``(imgs, temporal_coords, location_coords, metas)``:
        imgs: float32 np.ndarray with shape (1, C, num_frames, H, W).
        temporal_coords: list of ``[year, day_of_year]`` for each file whose
            name contains a ``YYYYDDDThhmmss`` or ``YYYYMMDDThhmmss`` stamp.
        location_coords: list of the coords returned by ``read_geotiff`` for
            each file where they were available.
        metas: list of meta info for each image in *file_paths*.
    """

    imgs = []
    metas = []
    temporal_coords = []
    location_coords = []

    for file in file_paths:
        img, meta, coords = read_geotiff(file)

        # Rescaling (don't normalize on nodata)
        img = np.moveaxis(img, 0, -1)  # channels last for rescaling
        if indices is not None:
            img = img[..., indices]
        if mean is not None and std is not None:
            img = np.where(img == NO_DATA, NO_DATA_FLOAT, (img - mean) / std)

        imgs.append(img)
        metas.append(meta)
        if coords is not None:
            location_coords.append(coords)

        # Best effort: a file without a parseable timestamp simply contributes
        # no temporal coordinate.
        try:
            match = re.search(r"(\d{7,8}T\d{6})", file)
            if match:
                date_part = match.group(1).split("T")[0]
                year = int(date_part[:4])
                if len(date_part) == 7:
                    # YYYYDDD: the day of year is given directly.
                    julian_day = int(date_part[4:])
                else:
                    # YYYYMMDD: parse together with the year so that leap
                    # years (including Feb 29) give the right day of year.
                    julian_day = (
                        datetime.datetime.strptime(date_part, "%Y%m%d")
                        .timetuple()
                        .tm_yday
                    )
                temporal_coords.append([year, julian_day])
        except (TypeError, ValueError) as e:
            print(f"Could not extract timestamp for {file} ({e})")

    imgs = np.stack(imgs, axis=0)  # num_frames, H, W, C
    imgs = np.moveaxis(imgs, -1, 0).astype("float32")  # C, num_frames, H, W
    imgs = np.expand_dims(imgs, axis=0)  # add batch dim

    return imgs, temporal_coords, location_coords, metas


219
220
221
222
223
224
225
226
227
def run_model(
    input_data,
    temporal_coords,
    location_coords,
    model,
    datamodule,
    img_size,
    lightning_model=None,
):
    """Run tiled inference over *input_data* and stitch the predictions.

    The input is reflect-padded on the bottom/right to a multiple of
    *img_size*, cut into non-overlapping ``img_size`` x ``img_size`` windows,
    each window is transformed and passed to ``model.run``, and the per-window
    class maps are reassembled and cropped back to the original size.

    Args:
        input_data: np.ndarray with five axes, laid out as
            (batch, channels, time, H, W).
        temporal_coords: list of temporal coordinates; converted to a tensor
            here but not passed to the model by this function.
        location_coords: list of location coordinates; only the first entry
            is used. May be empty or None.
        model: object exposing ``run(x, location_coords=...)``.
        datamodule: object exposing ``test_transform`` and ``aug``.
        img_size: side length of the square windows.
        lightning_model: unused by this function.

    Returns:
        torch.Tensor with shape (1, H, W) holding the predicted class index
        of every pixel as float.
    """
    # Reflect pad if not divisible by img_size
    original_h, original_w = input_data.shape[-2:]
    pad_h = (img_size - (original_h % img_size)) % img_size
    pad_w = (img_size - (original_w % img_size)) % img_size
    input_data = np.pad(
        input_data, ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)), mode="reflect"
    )

    # Build sliding window
    batch_size = 1
    batch = torch.tensor(input_data)
    windows = batch.unfold(3, img_size, img_size).unfold(4, img_size, img_size)
    h1, w1 = windows.shape[3:5]
    windows = rearrange(
        windows, "b c t h1 w1 h w -> (b h1 w1) c t h w", h=img_size, w=img_size
    )

    # Split into batches if number of windows > batch_size
    num_batches = windows.shape[0] // batch_size if windows.shape[0] > batch_size else 1
    windows = torch.tensor_split(windows, num_batches, dim=0)

    if temporal_coords:
        temporal_coords = torch.tensor(temporal_coords).unsqueeze(0)
    else:
        temporal_coords = None
    if location_coords:
        location_coords = torch.tensor(location_coords[0]).unsqueeze(0)
    else:
        location_coords = None

    # Run Prithvi-EO-V2-300M-TL-Sen1Floods11
    pred_imgs = []
    for x in windows:
        # Apply standardization
        # NOTE(review): squeeze() + transpose(1, 2, 0) assumes each window
        # reduces to (C, H, W), i.e. one window per batch and one time step.
        x = datamodule.test_transform(image=x.squeeze().numpy().transpose(1, 2, 0))
        x = datamodule.aug(x)["image"]

        with torch.no_grad():
            pred = model.run(x, location_coords=location_coords)
        y_hat = pred.argmax(dim=1)

        # Resize the class map back to the window size.
        y_hat = torch.nn.functional.interpolate(
            y_hat.unsqueeze(1).float(), size=img_size, mode="nearest"
        )

        pred_imgs.append(y_hat)

    pred_imgs = torch.concat(pred_imgs, dim=0)

    # Build images from patches
    pred_imgs = rearrange(
        pred_imgs,
        "(b h1 w1) c h w -> b c (h1 h) (w1 w)",
        h=img_size,
        w=img_size,
        b=1,
        c=1,
        h1=h1,
        w1=w1,
    )

    # Cut padded area back to original size
    pred_imgs = pred_imgs[..., :original_h, :original_w]

    # Squeeze (batch size 1)
    pred_imgs = pred_imgs[0]

    return pred_imgs


def main(
    data_file: str,
    model: str,
    output_dir: str,
    rgb_outputs: bool,
    input_indices: Union[list[int], None] = None,
):
    """Run flood-mapping inference on one GeoTIFF and save the results.

    Written to *output_dir* (created if missing), where ``<name>`` is the
    base name of *data_file* without extension:
        ``pred_<name>.tiff``: the prediction, one band.
        ``rgb_pred_<name>.tiff``: RGB image blended with the prediction.
        ``original_rgb_<name>.tiff``: RGB image only; written when
        *rgb_outputs* is true.

    Args:
        data_file: path to the input GeoTIFF.
        model: value passed to ``PrithviMAE(model=...)``.
        output_dir: directory receiving the output files.
        rgb_outputs: whether to additionally save the plain RGB image.
        input_indices: band indices to select from the input file; all
            bands are kept when None.
    """
    os.makedirs(output_dir, exist_ok=True)

    model_obj = PrithviMAE(model=model)
    datamodule = generate_datamodule()
    img_size = 512  # Size of Sen1Floods11

    input_data, temporal_coords, location_coords, meta_data = load_example(
        file_paths=[data_file],
        indices=input_indices,
    )

    meta_data = meta_data[0]  # only one image
    name_suffix = os.path.splitext(os.path.basename(data_file))[0]

    if input_data.mean() > 1:
        input_data = input_data / 10000  # Convert to range 0-1

    channels = [
        datamodule_config["bands"].index(b) for b in ["RED", "GREEN", "BLUE"]
    ]  # BGR -> RGB

    pred = run_model(
        input_data, temporal_coords, location_coords, model_obj, datamodule, img_size
    )

    # Save pred
    meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
    pred_file = os.path.join(output_dir, f"pred_{name_suffix}.tiff")
    save_geotiff(_convert_np_uint8(pred), pred_file, meta_data)

    # Save image + pred
    meta_data.update(count=3, dtype="uint8", compress="lzw", nodata=0)

    if input_data.mean() < 1:
        input_data = input_data * 10000  # Scale to 0-10000

    rgb_orig = process_channel_group(
        orig_img=torch.Tensor(input_data[0, :, 0, ...]),
        channels=channels,
    )
    rgb_orig = rgb_orig.to(torch.float32)

    # Pixels predicted as class 0 become NaN so that the blend below is NaN
    # there and can be replaced by the untouched RGB value.
    pred[pred == 0.0] = np.nan
    img_pred = rgb_orig * 0.7 + pred * 0.3
    nan_mask = img_pred.isnan()
    img_pred[nan_mask] = rgb_orig[nan_mask]

    img_pred_file = os.path.join(output_dir, f"rgb_pred_{name_suffix}.tiff")
    save_geotiff(
        image=_convert_np_uint8(img_pred),
        output_path=img_pred_file,
        meta=meta_data,
    )

    # Save image rgb
    if rgb_outputs:
        rgb_file = os.path.join(
            output_dir,
            f"original_rgb_{name_suffix}.tiff",
        )
        save_geotiff(
            image=_convert_np_uint8(rgb_orig),
            output_path=rgb_file,
            meta=meta_data,
        )


if __name__ == "__main__":
    # Command-line entry point: every parsed option is forwarded to main()
    # as a keyword argument of the same name.
    parser = argparse.ArgumentParser("MAE run inference", add_help=False)

    parser.add_argument(
        "--data_file",
        type=str,
        default="./India_900498_S2Hand.tif",
        help="Path to the file.",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
        help="Path to a checkpoint file to load from.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="output",
        help="Path to the directory where to save outputs.",
    )
    parser.add_argument(
        "--input_indices",
        default=[1, 2, 3, 8, 11, 12],
        type=int,
        nargs="+",
        help="""
        0-based indices of the six Prithvi channels to be selected from the input.
        By default selects [1,2,3,8,11,12] for S2L1C data.
        """,
    )
    parser.add_argument(
        "--rgb_outputs",
        action="store_true",
        # main() writes one extra file when this flag is set; it does not
        # change the other outputs.
        help="If present, additionally save the RGB composite of the input "
        "image as original_rgb_<name>.tiff.",
    )
    args = parser.parse_args()

    main(**vars(args))