# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import datetime
import os

import albumentations
import numpy as np
import rasterio
10
import regex as re
11
12
13
14
15
16
import torch
from einops import rearrange
from terratorch.datamodules import Sen1Floods11NonGeoDataModule

from vllm import LLM

17
18
# Make float16 the default floating-point dtype for tensors created in this
# script; the vLLM engine below is likewise loaded with dtype="float16".
torch.set_default_dtype(torch.float16)

19
20
21
22
23
24
# Sentinel that marks missing pixels in the raw input rasters (see load_example).
NO_DATA = -9999
# Value substituted for NO_DATA pixels when images are normalized; it is also
# the value process_channel_group treats as "no data" when building its mask.
NO_DATA_FLOAT = 0.0001
# Lower bound used when rescaling bands for visualization.
OFFSET = 0
# Percentile of the valid pixels used as the upper bound of that rescaling.
PERCENTILE = 99

# Settings for the Sen1Floods11 datamodule built by generate_datamodule().
# NOTE(review): generate_datamodule() only reads "data_root", "batch_size",
# "num_workers", "bands", "drop_last" and "test_transform"; the remaining keys
# ("constant_scale", "no_data_replace", "no_label_replace") are not consumed by
# any code visible in this file.
datamodule_config = {
    "bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"],
    "batch_size": 16,
    "constant_scale": 0.0001,
    "data_root": "/dccstor/geofm-finetuning/datasets/sen1floods11",
    "drop_last": True,
    "no_data_replace": 0.0,
    "no_label_replace": -1,
    "num_workers": 8,
    # Transform pipeline handed to the datamodule as its test transform:
    # resize to 448x448, then convert to a torch tensor.
    "test_transform": [
        albumentations.Resize(
            always_apply=False, height=448, interpolation=1, p=1, width=448
        ),
        albumentations.pytorch.ToTensorV2(
            transpose_mask=False, always_apply=True, p=1.0
        ),
    ],
}


class PrithviMAE:
    """Thin wrapper that runs a Prithvi model through a vLLM ``LLM`` engine."""

    def __init__(self, model):
        """Create the vLLM engine.

        Args:
            model: value forwarded to ``LLM(model=...)``.
        """
        self.model = LLM(
            model=model,
            skip_tokenizer_init=True,
            dtype="float16",
            enforce_eager=True,
            model_impl="terratorch",
            enable_mm_embeds=True,
        )

    def run(self, input_data, location_coords):
        """Encode one input with the model and return the raw output data.

        Args:
            input_data: tensor passed to the model as ``pixel_values``. A
                float32 tensor is cast to float16 and its first element along
                dim 0 is taken; any other input is forwarded unchanged.
            location_coords: value passed to the model as ``location_coords``
                (may be ``None``).

        Returns:
            ``outputs[0].outputs.data`` of the ``encode`` call.
        """
        # NOTE(review): the leading dimension is only stripped on the float32
        # path; an input that is already float16 keeps it. Presumably callers
        # always pass float32 -- confirm before relying on other dtypes.
        if input_data is not None and input_data.dtype == torch.float32:
            input_data = input_data.to(torch.float16)
            input_data = input_data[0]

        # Merge the inputs into one multi-modal data structure.
        mm_data = {
            "pixel_values": input_data,
            "location_coords": location_coords,
        }

        # A single placeholder token id accompanies the multi-modal payload.
        prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
        outputs = self.model.encode(prompt, pooling_task="plugin", use_tqdm=False)

        return outputs[0].outputs.data


def generate_datamodule():
    """Build the Sen1Floods11 datamodule from ``datamodule_config``.

    Returns:
        A ``Sen1Floods11NonGeoDataModule`` constructed with the data root,
        batch size, worker count, bands, ``drop_last`` flag and test transform
        taken from ``datamodule_config``.
    """
    return Sen1Floods11NonGeoDataModule(
        data_root=datamodule_config["data_root"],
        batch_size=datamodule_config["batch_size"],
        num_workers=datamodule_config["num_workers"],
        bands=datamodule_config["bands"],
        drop_last=datamodule_config["drop_last"],
        test_transform=datamodule_config["test_transform"],
    )


def process_channel_group(orig_img, channels):
    """Select channels from an image and rescale them to [0, 1] for display.

    Args:
        orig_img: torch.Tensor representing original image (reference)
        with shape = (bands, H, W).
        channels: list of indices representing RGB channels.

    Returns:
        torch.Tensor with shape (num_channels, height, width)
        for original image
    """

    orig_img = orig_img[channels, ...]
    # Pixels equal to NO_DATA_FLOAT are treated as missing.
    valid_mask = orig_img != NO_DATA_FLOAT

    # Rescale (enhancing contrast). The upper bound is the PERCENTILE-th
    # percentile of the valid pixels, but never less than 3000. With no valid
    # pixels at all the percentile is undefined, so fall back to 3000 directly.
    valid_values = orig_img[valid_mask]
    if valid_values.numel() > 0:
        max_value = max(3000, np.percentile(valid_values, PERCENTILE))
    else:
        max_value = 3000
    min_value = OFFSET

    orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0, 1)

    # No data as zeros
    orig_img[~valid_mask] = 0

    return orig_img


def read_geotiff(file_path: str):
    """Load every band of the raster stored at *file_path*.

    Args:
        file_path: path to image file.

    Returns:
        A 3-tuple ``(img, meta, coords)``:
        the array returned by the dataset's ``read()``,
        the dataset's ``meta`` info,
        and the result of the dataset's ``lnglat()`` call, or ``None`` when
        that call raises.
    """

    with rasterio.open(file_path) as dataset:
        pixels = dataset.read()
        info = dataset.meta
        try:
            lnglat = dataset.lnglat()
        except Exception:
            # Coordinates are optional: fall back to None if they are unreadable.
            lnglat = None

    return pixels, info, lnglat


def save_geotiff(image, output_path: str, meta: dict):
    """Write a multi-band image to a GeoTIFF file, one band at a time.

    Args:
        image: np.ndarray with shape (bands, height, width)
        output_path: path where to save the image
        meta: dict with meta info, expanded into keyword arguments of
            ``rasterio.open``.
    """

    with rasterio.open(output_path, "w", **meta) as dest:
        # Band indices passed to ``write`` start at 1.
        for band_number, band in enumerate(image, start=1):
            dest.write(band, band_number)


def _convert_np_uint8(float_image: torch.Tensor):
    image = float_image.numpy() * 255.0
    image = image.astype(dtype=np.uint8)

    return image


def load_example(
    file_paths: list[str],
    mean: list[float] | None = None,
    std: list[float] | None = None,
    indices: list[int] | None = None,
):
    """Build an input example by loading images in *file_paths*.

    Args:
        file_paths: list of file paths .
        mean: list containing mean values for each band in the
              images in *file_paths*.
        std: list containing std values for each band in the
             images in *file_paths*.
        indices: optional list of band indices to keep from each image.
                 All bands are kept when ``None``.

    Returns:
        np.array containing created example, shaped
        (1, channels, num_frames, height, width)
        list of ``[year, day_of_year]`` pairs, one per file whose name
        contains a parseable timestamp
        list of location coordinates, one per file that provides them
        list of meta info for each image in *file_paths*
    """

    imgs = []
    metas = []
    temporal_coords = []
    location_coords = []

    for file in file_paths:
        img, meta, coords = read_geotiff(file)

        # Rescaling (don't normalize on nodata)
        img = np.moveaxis(img, 0, -1)  # channels last for rescaling
        if indices is not None:
            img = img[..., indices]
        if mean is not None and std is not None:
            img = np.where(img == NO_DATA, NO_DATA_FLOAT, (img - mean) / std)

        imgs.append(img)
        metas.append(meta)
        if coords is not None:
            location_coords.append(coords)

        # Best-effort timestamp extraction from the file name. The date part
        # is either YYYYDDD (day of year) or YYYYMMDD, followed by "THHMMSS".
        try:
            match = re.search(r"(\d{7,8}T\d{6})", file)
            if match:
                date_part = match.group(1).split("T")[0]
                year = int(date_part[:4])
                if len(date_part) == 7:
                    julian_day = int(date_part[4:])
                else:
                    # Parse the full date including the year so that leap
                    # years yield the correct day of year (and Feb 29 parses).
                    julian_day = (
                        datetime.datetime.strptime(date_part, "%Y%m%d")
                        .timetuple()
                        .tm_yday
                    )
                temporal_coords.append([year, julian_day])
        except (ValueError, TypeError) as e:
            print(f"Could not extract timestamp for {file} ({e})")

    imgs = np.stack(imgs, axis=0)  # num_frames, H, W, C
    imgs = np.moveaxis(imgs, -1, 0).astype("float32")  # C, num_frames, H, W
    imgs = np.expand_dims(imgs, axis=0)  # add batch dim

    return imgs, temporal_coords, location_coords, metas


223
224
225
226
227
228
229
230
231
def run_model(
    input_data,
    temporal_coords,
    location_coords,
    model,
    datamodule,
    img_size,
    lightning_model=None,
):
    """Run tiled inference over *input_data* and stitch the predictions.

    The input is reflect-padded so that its height and width are multiples of
    *img_size*, cut into non-overlapping ``img_size`` x ``img_size`` windows,
    each window is transformed and passed to ``model.run``, and the per-window
    class maps are reassembled and cropped back to the original size.

    Args:
        input_data: array indexed as (batch, channels, frames, height, width);
            the code assumes batch == 1.
        temporal_coords: accepted for interface compatibility; not used.
        location_coords: list of location coordinates; only the first entry is
            forwarded to the model. Falsy values result in ``None``.
        model: object exposing ``run(x, location_coords=...)``.
        datamodule: object exposing ``test_transform(image=...)`` and
            ``aug(...)``.
        img_size: side length of the square windows.
        lightning_model: accepted for interface compatibility; not used.

    Returns:
        torch.Tensor with shape (1, original_height, original_width) holding
        the predicted class index per pixel as floats.
    """
    # Reflect pad if not divisible by img_size
    original_h, original_w = input_data.shape[-2:]
    pad_h = (img_size - (original_h % img_size)) % img_size
    pad_w = (img_size - (original_w % img_size)) % img_size
    input_data = np.pad(
        input_data, ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)), mode="reflect"
    )

    # Build sliding window
    batch_size = 1
    batch = torch.tensor(input_data)
    windows = batch.unfold(3, img_size, img_size).unfold(4, img_size, img_size)
    h1, w1 = windows.shape[3:5]
    windows = rearrange(
        windows, "b c t h1 w1 h w -> (b h1 w1) c t h w", h=img_size, w=img_size
    )

    # Split into batches if number of windows > batch_size
    num_batches = windows.shape[0] // batch_size if windows.shape[0] > batch_size else 1
    windows = torch.tensor_split(windows, num_batches, dim=0)

    # Only the first location is forwarded to the model.
    if location_coords:
        location_coords = torch.tensor(location_coords[0]).unsqueeze(0)
    else:
        location_coords = None

    # Run Prithvi-EO-V2-300M-TL-Sen1Floods11
    pred_imgs = []
    for x in windows:
        # Apply standardization. The window is squeezed and moved to
        # channels-last layout before being handed to the transform.
        x = datamodule.test_transform(image=x.squeeze().numpy().transpose(1, 2, 0))
        x = datamodule.aug(x)["image"]

        with torch.no_grad():
            pred = model.run(x, location_coords=location_coords)
        y_hat = pred.argmax(dim=1)

        # Resize the class map back to the window size (nearest neighbour keeps
        # the values as valid class indices).
        y_hat = torch.nn.functional.interpolate(
            y_hat.unsqueeze(1).float(), size=img_size, mode="nearest"
        )

        pred_imgs.append(y_hat)

    pred_imgs = torch.concat(pred_imgs, dim=0)

    # Build images from patches
    pred_imgs = rearrange(
        pred_imgs,
        "(b h1 w1) c h w -> b c (h1 h) (w1 w)",
        h=img_size,
        w=img_size,
        b=1,
        c=1,
        h1=h1,
        w1=w1,
    )

    # Cut padded area back to original size
    pred_imgs = pred_imgs[..., :original_h, :original_w]

    # Squeeze (batch size 1)
    pred_imgs = pred_imgs[0]

    return pred_imgs


def main(
    data_file: str,
    model: str,
    output_dir: str,
    rgb_outputs: bool,
    input_indices: list[int] | None = None,
):
    """Run inference on one GeoTIFF and write the results to *output_dir*.

    Files written (``<stem>`` is *data_file*'s base name without extension):
        ``pred_<stem>.tiff``: single-band prediction.
        ``rgb_pred_<stem>.tiff``: RGB image blended with the prediction.
        ``original_rgb_<stem>.tiff``: rescaled RGB image, only when
        *rgb_outputs* is true.

    Args:
        data_file: path to the input GeoTIFF.
        model: value passed to ``PrithviMAE`` to load the model.
        output_dir: directory for the outputs; created if missing.
        rgb_outputs: whether to also save the original RGB image.
        input_indices: band indices to select from the input, or ``None`` to
            keep all bands.
    """
    os.makedirs(output_dir, exist_ok=True)

    model_obj = PrithviMAE(model=model)
    datamodule = generate_datamodule()
    img_size = 512  # Size of Sen1Floods11

    input_data, temporal_coords, location_coords, meta_data = load_example(
        file_paths=[data_file],
        indices=input_indices,
    )

    meta_data = meta_data[0]  # only one image

    if input_data.mean() > 1:
        input_data = input_data / 10000  # Convert to range 0-1

    channels = [
        datamodule_config["bands"].index(b) for b in ["RED", "GREEN", "BLUE"]
    ]  # BGR -> RGB

    pred = run_model(
        input_data, temporal_coords, location_coords, model_obj, datamodule, img_size
    )

    # Base name shared by every output file.
    file_stem = os.path.splitext(os.path.basename(data_file))[0]

    # Save pred
    meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
    pred_file = os.path.join(output_dir, f"pred_{file_stem}.tiff")
    save_geotiff(_convert_np_uint8(pred), pred_file, meta_data)

    # Save image + pred
    meta_data.update(count=3, dtype="uint8", compress="lzw", nodata=0)

    if input_data.mean() < 1:
        input_data = input_data * 10000  # Scale to 0-10000

    rgb_orig = process_channel_group(
        orig_img=torch.Tensor(input_data[0, :, 0, ...]),
        channels=channels,
    )
    rgb_orig = rgb_orig.to(torch.float32)

    # Mark background (class 0) as NaN so the blend below only affects pixels
    # with a non-zero prediction; NaN pixels are then restored from the
    # original image.
    pred[pred == 0.0] = np.nan
    img_pred = rgb_orig * 0.7 + pred * 0.3
    nan_mask = img_pred.isnan()
    img_pred[nan_mask] = rgb_orig[nan_mask]

    img_pred_file = os.path.join(output_dir, f"rgb_pred_{file_stem}.tiff")
    save_geotiff(
        image=_convert_np_uint8(img_pred),
        output_path=img_pred_file,
        meta=meta_data,
    )

    # Save image rgb
    if rgb_outputs:
        rgb_file = os.path.join(
            output_dir,
            f"original_rgb_{file_stem}.tiff",
        )
        save_geotiff(
            image=_convert_np_uint8(rgb_orig),
            output_path=rgb_file,
            meta=meta_data,
        )


if __name__ == "__main__":
    # Command-line entry point: every option maps one-to-one onto a keyword
    # argument of main(). The default -h/--help flag is left enabled so the
    # help strings below are actually reachable.
    parser = argparse.ArgumentParser("MAE run inference")

    parser.add_argument(
        "--data_file",
        type=str,
        default="./India_900498_S2Hand.tif",
        help="Path to the file.",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
        help="Name or path of the model to load.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="output",
        help="Path to the directory where to save outputs.",
    )
    parser.add_argument(
        "--input_indices",
        default=[1, 2, 3, 8, 11, 12],
        type=int,
        nargs="+",
        help="""
        0-based indices of the six Prithvi channels to be selected from the input.
        By default selects [1,2,3,8,11,12] for S2L1C data.
        """,
    )
    parser.add_argument(
        "--rgb_outputs",
        action="store_true",
        help="If present, additionally save the RGB channels of the original "
        "image as a separate GeoTIFF.",
    )
    args = parser.parse_args()

    main(**vars(args))