Removing prototype related things from release/0.14 branch (#6687)

* Remove test related to prototype * Remove torchvision/prototype dir * Remove references/depth/stereo because it depend on prototype * Remove prototype related entries on mypy.ini * Remove things related to prototype in pytest.ini * clean setup.py from prototype * Clean CI from prototype * Remove unused expect file

Removing prototype related things from release/0.14 branch (#6687)
* Remove test related to prototype * Remove torchvision/prototype dir * Remove references/depth/stereo because it depend on prototype * Remove prototype related entries on mypy.ini * Remove things related to prototype in pytest.ini * clean setup.py from prototype * Clean CI from prototype * Remove unused expect file
673838f5 · YosuaMichael · GitHub · 07ae61bf · 673838f5 · 673838f5
Unverified Commit 673838f5 authored Oct 04, 2022 by YosuaMichael Committed by GitHub Oct 04, 2022
20 changed files
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -152,15 +152,6 @@ commands:
          args: --no-build-isolation <<# parameters.editable >> --editable <</ parameters.editable >> .
          descr: Install torchvision <<# parameters.editable >> in editable mode <</ parameters.editable >>

-  install_prototype_dependencies:
-    steps:
-      - pip_install:
-          args: iopath
-          descr: Install third-party dependencies
-      - pip_install:
-          args: --pre torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-          descr: Install torchdata from nightly releases
-
  # Most of the test suite is handled by the `unittest` jobs, with completely different workflow and setup.
  # This command can be used if only a selection of tests need to be run, for ad-hoc files.
  run_tests_selective:
@@ -326,7 +317,6 @@ jobs:
      - checkout
      - install_torchvision:
          editable: true
-      - install_prototype_dependencies
      - pip_install:
          args: mypy
          descr: Install Python type check utilities

--- a/.circleci/config.yml.in
+++ b/.circleci/config.yml.in
@@ -152,15 +152,6 @@ commands:
          args: --no-build-isolation <<# parameters.editable >> --editable <</ parameters.editable >> .
          descr: Install torchvision <<# parameters.editable >> in editable mode <</ parameters.editable >>

-  install_prototype_dependencies:
-    steps:
-      - pip_install:
-          args: iopath
-          descr: Install third-party dependencies
-      - pip_install:
-          args: --pre torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-          descr: Install torchdata from nightly releases
-
  # Most of the test suite is handled by the `unittest` jobs, with completely different workflow and setup.
  # This command can be used if only a selection of tests need to be run, for ad-hoc files.
  run_tests_selective:
@@ -326,7 +317,6 @@ jobs:
      - checkout
      - install_torchvision:
          editable: true
-      - install_prototype_dependencies
      - pip_install:
          args: mypy
          descr: Install Python type check utilities

--- a/.github/workflows/prototype-tests.yml
+++ b/.github/workflows/prototype-tests.yml
-# Runs the prototype test suites on every pull request, once per operating
-# system listed in the matrix.
-name: tests
-
-on:
-  pull_request:
-
-jobs:
-  prototype:
-    strategy:
-      matrix:
-        os:
-          - ubuntu-latest
-          - windows-latest
-          - macos-latest
-      # Keep the remaining matrix entries running when one of them fails.
-      fail-fast: false
-
-    runs-on: ${{ matrix.os }}
-
-    steps:
-      - name: Set up python
-        uses: actions/setup-python@v3
-        with:
-          # Quoted so the version is always read as a string, never as a float.
-          python-version: '3.7'
-
-      - name: Upgrade system packages
-        run: python -m pip install --upgrade pip setuptools wheel
-
-      - name: Checkout repository
-        uses: actions/checkout@v3
-
-      - name: Install PyTorch nightly builds
-        run: pip install --progress-bar=off --pre torch torchdata --extra-index-url https://download.pytorch.org/whl/nightly/cpu/
-
-      - name: Install torchvision
-        run: pip install --progress-bar=off --no-build-isolation --editable .
-
-      - name: Install other prototype dependencies
-        run: pip install --progress-bar=off scipy pycocotools h5py iopath
-
-      - name: Install test requirements
-        run: pip install --progress-bar=off pytest pytest-mock pytest-cov
-
-      # Sentinel step: the test steps below check `steps.setup.conclusion` so
-      # that they still run after an earlier test step failed, but are skipped
-      # when the environment setup itself did not complete.
-      - name: Mark setup as complete
-        id: setup
-        run: exit 0
-
-      - name: Run prototype features tests
-        shell: bash
-        run: |
-          pytest \
-            --durations=20 \
-            --cov=torchvision/prototype/features \
-            --cov-report=term-missing \
-            test/test_prototype_features*.py
-
-      - name: Run prototype datasets tests
-        if: success() || ( failure() && steps.setup.conclusion == 'success' )
-        shell: bash
-        run: |
-          pytest \
-            --durations=20 \
-            --cov=torchvision/prototype/datasets \
-            --cov-report=term-missing \
-            test/test_prototype_datasets*.py
-
-      - name: Run prototype transforms tests
-        if: success() || ( failure() && steps.setup.conclusion == 'success' )
-        shell: bash
-        run: |
-          pytest \
-            --durations=20 \
-            --cov=torchvision/prototype/transforms \
-            --cov-report=term-missing \
-            test/test_prototype_transforms*.py
-
-      - name: Run prototype models tests
-        if: success() || ( failure() && steps.setup.conclusion == 'success' )
-        shell: bash
-        run: |
-          pytest \
-            --durations=20 \
-            --cov=torchvision/prototype/models \
-            --cov-report=term-missing \
-            test/test_prototype_models*.py
--- a/mypy.ini
+++ b/mypy.ini
@@ -7,52 +7,6 @@ allow_redefinition = True
 no_implicit_optional = True
 warn_redundant_casts = True

-[mypy-torchvision.prototype.features.*]
-
-; untyped definitions and calls
-disallow_untyped_defs = True
-
-; None and Optional handling
-no_implicit_optional = True
-
-; warnings
-warn_unused_ignores = True
-warn_return_any = True
-
-; miscellaneous strictness flags
-allow_redefinition = True
-
-[mypy-torchvision.prototype.transforms.*]
-
-; untyped definitions and calls
-disallow_untyped_defs = True
-
-; None and Optional handling
-no_implicit_optional = True
-
-; warnings
-warn_unused_ignores = True
-warn_return_any = True
-
-; miscellaneous strictness flags
-allow_redefinition = True
-
-[mypy-torchvision.prototype.datasets.*]
-
-; untyped definitions and calls
-disallow_untyped_defs = True
-
-; None and Optional handling
-no_implicit_optional = True
-
-; warnings
-warn_unused_ignores = True
-warn_return_any = True
-warn_unreachable = True
-
-; miscellaneous strictness flags
-allow_redefinition = True
-
 [mypy-torchvision.io.image.*]

 ignore_errors = True
@@ -149,10 +103,6 @@ ignore_missing_imports = True

 ignore_missing_imports = True

-[mypy-torchdata.*]
-
-ignore_missing_imports = True
-
 [mypy-h5py.*]

 ignore_missing_imports = True
--- a/pytest.ini
+++ b/pytest.ini
@@ -7,7 +7,6 @@ addopts =
    # enable all warnings
    -Wd
    --ignore=test/test_datasets_download.py
-    --ignore-glob=test/test_prototype_*.py
 testpaths =
    test
 xfail_strict = True
--- a/references/depth/stereo/README.md
+++ b/references/depth/stereo/README.md
-# Stereo Matching reference training scripts
-
-This folder contains reference training scripts for Stereo Matching.
-They serve as a log of how to train specific models, so as to provide baseline
-training and evaluation scripts to quickly bootstrap research.
-
-
-### CREStereo
-
-The CREStereo model was trained on a dataset mixture between **CREStereo**, **ETH3D** and the additional split from **Middlebury2014**.
-A ratio of **88-6-6** was used in order to train a baseline weight set. We provide multi-set variant as well.
-Both used 8 A100 GPUs and a batch size of 2 (so effective batch size is 16). The
-rest of the hyper-parameters loosely follow the recipe from https://github.com/megvii-research/CREStereo.
-The original recipe trains for **300000** updates (or steps) on the dataset mixture. We modify the learning rate
-schedule to one that starts decaying the weight much sooner. Throughout experiments we found that this reduces overfitting
-during evaluation time, and gradient clipping helps stabilize the loss during a premature learning rate change.
-
-```
-torchrun --nproc_per_node 8 --nnodes 1 train.py \
-    --dataset-root $dataset_root \
-    --name $name_cre \
-    --model crestereo_base \
-    --train-datasets crestereo eth3d-train middlebury2014-other \
-    --dataset-steps 264000 18000 18000 \
-    --batch-size 2 \
-    --lr 0.0004 \
-    --min-lr 0.00002 \
-    --lr-decay-method cosine \
-    --warmup-steps 6000 \
-    --decay-after-steps 30000 \
-    --clip-grad-norm 1.0
-```
-
-We employ a multi-set fine-tuning stage where we uniformly sample from multiple datasets. Given that some of these datasets have extremely large images (``2048x2048`` or more) we opt for a very aggressive scale-range ``[0.2 - 0.8]`` such that as much of the original frame composition is captured inside the ``384x512`` crop.
-
-```
-torchrun --nproc_per_node 8 --nnodes 1 train.py \
-    --dataset-root $dataset_root \
-    --name $name_things \
-    --model crestereo_base \
-    --train-datasets crestereo eth3d-train middlebury2014-other instereo2k fallingthings carla-highres sintel sceneflow-monkaa sceneflow-driving \
-    --dataset-steps 12000 12000 12000 12000 12000 12000 12000 12000 12000 \
-    --batch-size 2 \
-    --scale-range 0.2 0.8 \
-    --lr 0.0004 \
-    --lr-decay-method cosine \
-    --decay-after-steps 0 \
-    --warmup-steps 0 \
-    --min-lr 0.00002 \
-    --resume-path $checkpoint_dir/$name_cre.pth
-```
-
-
-### Evaluation
-
-Evaluating the base weights
-
-```
-torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.CRESTEREO_ETH_MBL_V1
-```
-
-This should give an **mae of about 1.416** on the train set of `Middlebury2014`. Results may vary slightly depending on the batch size and the number of GPUs. For the most accurate results use 1 GPU and `--batch-size 1`. The created log file should look like this, where the first key is the number of cascades and the nested key is the number of recursive iterations:
-
-```
-Dataset: middlebury2014-train @size: [384, 512]:
-{
-	1: {
-		2: {'mae': 2.363, 'rmse': 4.352, '1px': 0.611, '3px': 0.828, '5px': 0.891, 'relepe': 0.176, 'fl-all': 64.511}
-		5: {'mae': 1.618, 'rmse': 3.71, '1px': 0.761, '3px': 0.879, '5px': 0.918, 'relepe': 0.154, 'fl-all': 77.128}
-		10: {'mae': 1.416, 'rmse': 3.53, '1px': 0.777, '3px': 0.896, '5px': 0.933, 'relepe': 0.148, 'fl-all': 78.388}
-		20: {'mae': 1.448, 'rmse': 3.583, '1px': 0.771, '3px': 0.893, '5px': 0.931, 'relepe': 0.145, 'fl-all': 77.7}
-	},
-}
-{
-	2: {
-		2: {'mae': 1.972, 'rmse': 4.125, '1px': 0.73, '3px': 0.865, '5px': 0.908, 'relepe': 0.169, 'fl-all': 74.396}
-		5: {'mae': 1.403, 'rmse': 3.448, '1px': 0.793, '3px': 0.905, '5px': 0.937, 'relepe': 0.151, 'fl-all': 80.186}
-		10: {'mae': 1.312, 'rmse': 3.368, '1px': 0.799, '3px': 0.912, '5px': 0.943, 'relepe': 0.148, 'fl-all': 80.379}
-		20: {'mae': 1.376, 'rmse': 3.542, '1px': 0.796, '3px': 0.91, '5px': 0.942, 'relepe': 0.149, 'fl-all': 80.054}
-	},
-}
-```
-
-You can also evaluate the Finetuned weights:
-
-```
-torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.CRESTEREO_FINETUNE_MULTI_V1
-```
-
-```
-Dataset: middlebury2014-train @size: [384, 512]:
-{
-	1: {
-		2: {'mae': 1.85, 'rmse': 3.797, '1px': 0.673, '3px': 0.862, '5px': 0.917, 'relepe': 0.171, 'fl-all': 69.736}
-		5: {'mae': 1.111, 'rmse': 3.166, '1px': 0.838, '3px': 0.93, '5px': 0.957, 'relepe': 0.134, 'fl-all': 84.596}
-		10: {'mae': 1.02, 'rmse': 3.073, '1px': 0.854, '3px': 0.938, '5px': 0.96, 'relepe': 0.129, 'fl-all': 86.042}
-		20: {'mae': 0.993, 'rmse': 3.059, '1px': 0.855, '3px': 0.942, '5px': 0.967, 'relepe': 0.126, 'fl-all': 85.784}
-	},
-}
-{
-	2: {
-		2: {'mae': 1.667, 'rmse': 3.867, '1px': 0.78, '3px': 0.891, '5px': 0.922, 'relepe': 0.165, 'fl-all': 78.89}
-		5: {'mae': 1.158, 'rmse': 3.278, '1px': 0.843, '3px': 0.926, '5px': 0.955, 'relepe': 0.135, 'fl-all': 84.556}
-		10: {'mae': 1.046, 'rmse': 3.13, '1px': 0.85, '3px': 0.934, '5px': 0.96, 'relepe': 0.13, 'fl-all': 85.464}
-		20: {'mae': 1.021, 'rmse': 3.102, '1px': 0.85, '3px': 0.935, '5px': 0.963, 'relepe': 0.129, 'fl-all': 85.417}
-	},
-}
-```
-
-Evaluating the author provided weights:
-
-```
-torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.MEGVII_V1
-```
-
-```
-Dataset: middlebury2014-train @size: [384, 512]:
-{
-	1: {
-		2: {'mae': 1.704, 'rmse': 3.738, '1px': 0.738, '3px': 0.896, '5px': 0.933, 'relepe': 0.157, 'fl-all': 76.464}
-		5: {'mae': 0.956, 'rmse': 2.963, '1px': 0.88, '3px': 0.948, '5px': 0.965, 'relepe': 0.124, 'fl-all': 88.186}
-		10: {'mae': 0.792, 'rmse': 2.765, '1px': 0.905, '3px': 0.958, '5px': 0.97, 'relepe': 0.114, 'fl-all': 90.429}
-		20: {'mae': 0.749, 'rmse': 2.706, '1px': 0.907, '3px': 0.961, '5px': 0.972, 'relepe': 0.113, 'fl-all': 90.807}
-	},
-}
-{
-	2: {
-		2: {'mae': 1.702, 'rmse': 3.784, '1px': 0.784, '3px': 0.894, '5px': 0.924, 'relepe': 0.172, 'fl-all': 80.313}
-		5: {'mae': 0.932, 'rmse': 2.907, '1px': 0.877, '3px': 0.944, '5px': 0.963, 'relepe': 0.125, 'fl-all': 87.979}
-		10: {'mae': 0.773, 'rmse': 2.768, '1px': 0.901, '3px': 0.958, '5px': 0.972, 'relepe': 0.117, 'fl-all': 90.43}
-		20: {'mae': 0.854, 'rmse': 2.971, '1px': 0.9, '3px': 0.957, '5px': 0.97, 'relepe': 0.122, 'fl-all': 90.269}
-	},
-}
-```
-
-# Concerns when training
-
-We encourage users to be aware of the **aspect-ratio** and **disparity scale** they are targeting when doing any sort of training or fine-tuning. The model is highly sensitive to these two factors, as a consequence with naive multi-set fine-tuning one can achieve `0.2 mae` relatively fast. We recommend that users pay close attention to how they **balance dataset sizing** when training such networks.
-
- Ideally, dataset scaling should be treated at an individual level and a thorough **EDA** of the disparity distribution in random crops at the desired training / inference size should be performed prior to any large compute investments.
-
-### Disparity scaling
-
-##### Sample A
- The top row contains a sample from `Sintel` whereas the bottom row one from `Middlebury`.
-
-![Disparity1](assets/disparity-domain-drift.jpg)
-
-From left to right (`left_image`, `right_image`, `valid_mask`, `valid_mask & ground_truth`, `prediction`). **Darker is further away, lighter is closer**. In the case of `Sintel` which is more closely aligned to the original distribution of `CREStereo` we notice that the model accurately predicts the background scale whereas in the case of `Middlebury2014` it cannot correctly estimate the continuous disparity. Notice that the frame composition is similar for both examples. The blue skybox in the `Sintel` scene behaves similarly to the `Middlebury` black background. However, because the `Middlebury` sample comes from an extremely large scene the crop size of `384x512` does not correctly capture the general training distribution.
-
-
-
-
-##### Sample B
-
-The top row contains a scene from `Sceneflow` using the `Monkaa` split whilst the bottom row is a scene from `Middlebury`. This sample exhibits the same issues when it comes to **background estimation**. Given the exaggerated size of the `Middlebury` samples the model **collapses the smooth background** of the sample to what it considers to be a mean background disparity value.
-
-![Disparity2](assets/disparity-background-mode-collapse.jpg)
-
-
-For more detail on why this behaviour occurs based on the training distribution proportions you can read more about the network at: https://github.com/pytorch/vision/pull/6629#discussion_r978160493
-
-
-### Metric overfitting
-
-##### Learning is critical in the beginning
-
-We also advise users to make use of faster training schedules, as the performance gain over long periods of time is marginal. Here we exhibit a difference between a faster decay schedule and later decay schedule.
-
-![Loss1](assets/Loss.jpg)
-
-In **grey** we set the lr decay to begin after `30000` steps whilst in **orange** we opt for a very late learning rate decay at around `180000` steps. Although exhibiting stronger variance, we can notice that unfreezing the learning rate earlier whilst employing `gradient-norm` out-performs the default configuration.
-
-##### Gradient norm saves time
-
-![Loss2](assets/gradient-norm-removal.jpg)
-
-In **grey** we keep ``gradient norm`` enabled whilst in **orange** we do not. We can notice that removing the gradient norm exacerbates the performance decrease in the early stages whilst also showcasing an almost complete collapse around the `60000` steps mark where we started decaying the lr for **orange**.
-
-Although both runs achieve an improvement of about ``0.1`` mae after the lr decay start, the benefits of it are observable much faster when ``gradient norm`` is employed as the recovery period is no longer accounted for.
--- a/references/depth/stereo/__init__.py
+++ b/references/depth/stereo/__init__.py
--- a/references/depth/stereo/assets/Loss.jpg
+++ b/references/depth/stereo/assets/Loss.jpg
--- a/references/depth/stereo/assets/disparity-background-mode-collapse.jpg
+++ b/references/depth/stereo/assets/disparity-background-mode-collapse.jpg
--- a/references/depth/stereo/assets/disparity-domain-drift.jpg
+++ b/references/depth/stereo/assets/disparity-domain-drift.jpg
--- a/references/depth/stereo/assets/gradient-norm-removal.jpg
+++ b/references/depth/stereo/assets/gradient-norm-removal.jpg
--- a/references/depth/stereo/cascade_evaluation.py
+++ b/references/depth/stereo/cascade_evaluation.py
-import os
-import warnings
-
-import torch
-import torchvision
-import torchvision.prototype.models.depth.stereo
-import utils
-from torch.nn import functional as F
-from train import make_eval_loader
-
-from utils.metrics import AVAILABLE_METRICS
-from vizualization import make_prediction_image_side_to_side
-
-
-def get_args_parser(add_help=True):
-    """Build the command-line parser for the stereo matching evaluation script.
-
-    Args:
-        add_help: forwarded to ``argparse.ArgumentParser`` as ``add_help``.
-
-    Returns:
-        The configured ``argparse.ArgumentParser``.
-    """
-    import argparse
-
-    parser = argparse.ArgumentParser(description="PyTorch Stereo Matching Evaluation", add_help=add_help)
-    # dataset selection
-    parser.add_argument("--dataset", type=str, default="middlebury2014-train", help="dataset to use")
-    parser.add_argument("--dataset-root", type=str, default="", help="root of the dataset")
-
-    # model selection: either a local checkpoint or a torchvision weight name
-    parser.add_argument("--checkpoint", type=str, default="", help="path to weights")
-    parser.add_argument("--weights", type=str, default=None, help="torchvision API weight")
-    parser.add_argument(
-        "--model",
-        type=str,
-        default="crestereo_base",
-        help="which model to use if not speciffying a training checkpoint",
-    )
-    parser.add_argument("--img-folder", type=str, default="images")
-
-    # data loading
-    parser.add_argument("--batch-size", type=int, default=1, help="batch size")
-    parser.add_argument("--workers", type=int, default=0, help="number of workers")
-
-    # input pre-processing
-    parser.add_argument("--eval-size", type=int, nargs="+", default=[384, 512], help="resize size")
-    parser.add_argument(
-        "--norm-mean", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="mean for image normalization"
-    )
-    parser.add_argument(
-        "--norm-std", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="std for image normalization"
-    )
-    parser.add_argument(
-        "--use-grayscale", action="store_true", help="use grayscale images instead of RGB", default=False
-    )
-    parser.add_argument("--max-disparity", type=float, default=None, help="maximum disparity")
-    parser.add_argument(
-        "--interpolation-strategy",
-        type=str,
-        default="bilinear",
-        help="interpolation strategy",
-        choices=["bilinear", "bicubic", "mixed"],
-    )
-
-    # evaluation grid: every cascade count is combined with every iteration count
-    parser.add_argument("--n_iterations", nargs="+", type=int, default=[10], help="number of recurent iterations")
-    parser.add_argument("--n_cascades", nargs="+", type=int, default=[1], help="number of cascades")
-    parser.add_argument(
-        "--metrics",
-        type=str,
-        nargs="+",
-        default=["mae", "rmse", "1px", "3px", "5px", "relepe"],
-        help="metrics to log",
-        choices=AVAILABLE_METRICS,
-    )
-    parser.add_argument("--mixed-precision", action="store_true", help="use mixed precision training")
-
-    # distributed / device setup
-    parser.add_argument("--world-size", type=int, default=1, help="number of distributed processes")
-    parser.add_argument("--dist-url", type=str, default="env://", help="url used to set up distributed training")
-    parser.add_argument("--device", type=str, default="cuda", help="device to use for training")
-
-    # output options
-    parser.add_argument("--save-images", action="store_true", help="save images of the predictions")
-    parser.add_argument("--padder-type", type=str, default="kitti", help="padder type", choices=["kitti", "sintel"])
-
-    return parser
-
-
-def cascade_inference(model, image_left, image_right, iterations, cascades):
-    """Run coarse-to-fine inference over an image pyramid with ``cascades`` levels.
-
-    The full-resolution pair is level 0; each additional level halves both
-    spatial dimensions. The model is called from the coarsest level to the
-    finest one, and the last prediction of each level is passed as the flow
-    initialisation of the next.
-
-    Args:
-        model: callable invoked as ``model(left, right, flow_init, num_iters=...)``
-            and returning a sequence whose last element is the final prediction.
-        image_left: left image tensor; the last two dimensions are height and width.
-        image_right: right image tensor with the same layout.
-        iterations: value forwarded to the model as ``num_iters``.
-        cascades: number of pyramid levels (``1`` means full resolution only).
-
-    Returns:
-        The last prediction produced at full resolution.
-
-    Raises:
-        ValueError: if the height or width of either image is not divisible
-            by ``2 ** (cascades - 1)``.
-    """
-    # Each extra cascade halves the resolution, so both spatial dimensions have
-    # to be divisible by 2 ** (cascades - 1).
-    # NOTE(review): the model may additionally need a multiple of 16 per level;
-    # that is not enforced here - confirm against the padder / model.
-    divisor = 2 ** (cascades - 1)
-    for image in [image_left, image_right]:
-        if image.shape[-2] % divisor != 0:
-            raise ValueError(f"image height is not divisible by {divisor}. Image shape: {image.shape[-2]}")
-
-        if image.shape[-1] % divisor != 0:
-            raise ValueError(f"image width is not divisible by {divisor}. Image shape: {image.shape[-1]}")
-
-    left_image_pyramid = [image_left]
-    right_image_pyramid = [image_right]
-    for idx in range(0, cascades - 1):
-        ds_factor = int(2 ** (idx + 1))
-        ds_shape = (image_left.shape[-2] // ds_factor, image_left.shape[-1] // ds_factor)
-        left_image_pyramid.append(F.interpolate(image_left, size=ds_shape, mode="bilinear", align_corners=True))
-        right_image_pyramid.append(F.interpolate(image_right, size=ds_shape, mode="bilinear", align_corners=True))
-
-    flow_init = None
-    # walk the pyramid from the coarsest level to the full-resolution one
-    for left_image, right_image in zip(reversed(left_image_pyramid), reversed(right_image_pyramid)):
-        flow_pred = model(left_image, right_image, flow_init, num_iters=iterations)
-        # flow_pred is a list; its last entry is the most refined prediction
-        flow_init = flow_pred[-1]
-
-    return flow_init
-
-
-@torch.inference_mode()
-def _evaluate(
-    model,
-    args,
-    val_loader,
-    *,
-    padder_mode,
-    print_freq=10,
-    writter=None,
-    step=None,
-    iterations=10,
-    cascades=1,
-    batch_size=None,
-    header=None,
-    save_images=False,
-    save_path="",
-):
-    """Helper function to compute various metrics (epe, etc.) for a model on a given dataset.
-    We process as many samples as possible with ddp.
-
-    Args:
-        model: network to evaluate; it is switched to eval mode.
-        args: parsed options; ``device``, ``metrics``, ``mixed_precision``,
-            ``distributed``, ``rank`` and ``world_size`` are read here.
-        val_loader: loader yielding ``(image_left, image_right, disp_gt, valid_disp_mask)``.
-        padder_mode: mode forwarded to ``utils.InputPadder``.
-        print_freq: logging frequency forwarded to ``MetricLogger.log_every``.
-        writter: optional object exposing ``add_scalar``; only used on rank 0.
-        step: global step forwarded to ``writter.add_scalar``.
-        iterations: number of recurrent iterations; falls back to
-            ``args.recurrent_updates`` when falsy.
-        cascades: number of cascades used by ``cascade_inference``.
-        batch_size: accepted for interface compatibility; not used in the body.
-        header: prefix for log lines, defaults to ``"Test:"``.
-        save_images: when true, a side-by-side prediction image is written per batch.
-        save_path: destination folder for those images.
-
-    Returns:
-        Dict mapping every tracked meter name to its ``global_avg``.
-    """
-    model.eval()
-    header = header or "Test:"
-    device = torch.device(args.device)
-    metric_logger = utils.MetricLogger(delimiter="  ")
-
-    iterations = iterations or args.recurrent_updates
-
-    logger = utils.MetricLogger()
-    for meter_name in args.metrics:
-        logger.add_meter(meter_name, fmt="{global_avg:.4f}")
-    # "fl-all" is always tracked, even when it was not requested explicitly
-    if "fl-all" not in args.metrics:
-        logger.add_meter("fl-all", fmt="{global_avg:.4f}")
-
-    num_processed_samples = 0
-    with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16):
-        batch_idx = 0
-        for blob in metric_logger.log_every(val_loader, print_freq, header):
-            image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob)
-            padder = utils.InputPadder(image_left.shape, mode=padder_mode)
-            image_left, image_right = padder.pad(image_left, image_right)
-
-            disp_pred = cascade_inference(model, image_left, image_right, iterations, cascades)
-            # keep only the first channel of the prediction before removing the padding
-            disp_pred = disp_pred[:, :1, :, :]
-            disp_pred = padder.unpad(disp_pred)
-
-            if save_images:
-                if args.distributed:
-                    rank_prefix = args.rank
-                else:
-                    rank_prefix = 0
-                make_prediction_image_side_to_side(
-                    disp_pred, disp_gt, valid_disp_mask, save_path, prefix=f"batch_{rank_prefix}_{batch_idx}"
-                )
-
-            metrics, _ = utils.compute_metrics(disp_pred, disp_gt, valid_disp_mask, metrics=logger.meters.keys())
-            num_processed_samples += image_left.shape[0]
-            for name in metrics:
-                logger.meters[name].update(metrics[name], n=1)
-
-            batch_idx += 1
-
-    num_processed_samples = utils.reduce_across_processes(num_processed_samples) / args.world_size
-
-    print("Num_processed_samples: ", num_processed_samples)
-    # get_rank() needs an initialized process group; without one this process
-    # is the only one running, so it is the one that should emit the warning.
-    dist_ready = torch.distributed.is_available() and torch.distributed.is_initialized()
-    if (
-        hasattr(val_loader.dataset, "__len__")
-        and len(val_loader.dataset) != num_processed_samples
-        and (not dist_ready or torch.distributed.get_rank() == 0)
-    ):
-        warnings.warn(
-            f"Number of processed samples {num_processed_samples} is different "
-            f"from the dataset size {len(val_loader.dataset)}. This may happen if "
-            "the dataset is not divisible by the batch size. Try lowering the batch size for more accurate results."
-        )
-
-    if writter is not None and args.rank == 0:
-        for meter_name, meter_value in logger.meters.items():
-            scalar_name = f"{meter_name} {header}"
-            writter.add_scalar(scalar_name, meter_value.avg, step)
-
-    logger.synchronize_between_processes()
-    print(header, logger)
-
-    logger_metrics = {k: v.global_avg for k, v in logger.meters.items()}
-    return logger_metrics
-
-
-def evaluate(model, loader, args, writter=None, step=None):
-    """Evaluate ``model`` for every (n_cascades, n_iterations) combination.
-
-    For each combination a folder
-    ``<img_folder>/<checkpoint_name>/<dataset>/<n_cascades>c_<n_iters>i`` is
-    created and ``_evaluate`` is run. The collected metrics are printed and
-    written to ``<checkpoint_name>_eval.log`` (with any ``.pth`` removed from
-    the name), opened relative to the current working directory.
-
-    Args:
-        model: network to evaluate.
-        loader: evaluation data loader forwarded to ``_evaluate``.
-        args: parsed options; ``img_folder``, ``checkpoint``, ``weights``,
-            ``dataset``, ``n_cascades``, ``n_iterations``, ``padder_type``,
-            ``eval_size``, ``batch_size`` and ``save_images`` are read here.
-        writter: optional scalar writer forwarded to ``_evaluate``.
-        step: global step forwarded to ``_evaluate``.
-    """
-    os.makedirs(args.img_folder, exist_ok=True)
-    # name outputs after the checkpoint file, or after the weight name when no checkpoint path is given
-    checkpoint_name = os.path.basename(args.checkpoint) or args.weights
-    image_checkpoint_folder = os.path.join(args.img_folder, checkpoint_name)
-
-    metrics = {}
-    base_image_folder = os.path.join(image_checkpoint_folder, args.dataset)
-    os.makedirs(base_image_folder, exist_ok=True)
-
-    for n_cascades in args.n_cascades:
-        for n_iters in args.n_iterations:
-
-            # key of the form "<cascades>c_<iterations>i"; parsed back below
-            config = f"{n_cascades}c_{n_iters}i"
-            config_image_folder = os.path.join(base_image_folder, config)
-            os.makedirs(config_image_folder, exist_ok=True)
-
-            metrics[config] = _evaluate(
-                model,
-                args,
-                loader,
-                padder_mode=args.padder_type,
-                header=f"{args.dataset} evaluation@ size:{args.eval_size} n_cascades:{n_cascades} n_iters:{n_iters}",
-                batch_size=args.batch_size,
-                writter=writter,
-                step=step,
-                iterations=n_iters,
-                cascades=n_cascades,
-                save_path=config_image_folder,
-                save_images=args.save_images,
-            )
-
-    metric_log = []
-    metric_log_dict = {}
-    # print the final results
-    for config in metrics:
-        # strip the trailing "c" / "i" markers to recover the two numbers as strings
-        config_tokens = config.split("_")
-        config_iters = config_tokens[1][:-1]
-        config_cascades = config_tokens[0][:-1]
-
-        metric_log_dict[config_cascades] = metric_log_dict.get(config_cascades, {})
-        metric_log_dict[config_cascades][config_iters] = metrics[config]
-
-        evaluation_str = f"{args.dataset} evaluation@ size:{args.eval_size} n_cascades:{config_cascades} recurrent_updates:{config_iters}"
-        metrics_str = f"Metrics: {metrics[config]}"
-        metric_log.extend([evaluation_str, metrics_str])
-
-        print(evaluation_str)
-        print(metrics_str)
-
-    eval_log_name = f"{checkpoint_name.replace('.pth', '')}_eval.log"
-    print("Saving eval log to: ", eval_log_name)
-    with open(eval_log_name, "w") as f:
-        f.write(f"Dataset: {args.dataset} @size: {args.eval_size}:\n")
-        # write the dict line by line for each key, and each value in the keys
-        for config_cascades in metric_log_dict:
-            f.write("{\n")
-            f.write(f"\t{config_cascades}: {{\n")
-            for config_iters in metric_log_dict[config_cascades]:
-                # round every metric to 3 decimal places
-                # (this rebinds `metrics`; the outer results dict is no longer needed here)
-                metrics = metric_log_dict[config_cascades][config_iters]
-                metrics = {k: float(f"{v:.3f}") for k, v in metrics.items()}
-                f.write(f"\t\t{config_iters}: {metrics}\n")
-            f.write("\t},\n")
-            f.write("}\n")
-
-
-def load_checkpoint(args):
-    """Build the model to evaluate and move it to the requested device.
-
-    When ``args.weights`` is empty the model is restored from the file at
-    ``args.checkpoint``: a dict with a ``"model"`` key is treated as a training
-    checkpoint (the architecture name is taken from its stored ``"args"``),
-    anything else as a bare state dict for ``args.model``. Otherwise the model
-    ``args.model`` is built with ``weights=args.weights``.
-
-    Args:
-        args: parsed options; ``utils.setup_ddp`` is called on them first.
-
-    Returns:
-        The model, wrapped in ``DistributedDataParallel`` when ``args.distributed`` is set.
-
-    Raises:
-        ValueError: if distributed mode is requested together with the ``cpu`` device.
-    """
-    utils.setup_ddp(args)
-
-    # Resolve the device before branching: both branches below end with a move
-    # to it, so it must be defined even when pre-trained weights are used.
-    if args.distributed and args.device == "cpu":
-        raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun")
-    device = torch.device(args.device)
-
-    if not args.weights:
-        checkpoint = torch.load(args.checkpoint, map_location=torch.device("cpu"))
-        if "model" in checkpoint:
-            experiment_args = checkpoint["args"]
-            model = torchvision.prototype.models.depth.stereo.__dict__[experiment_args.model](weights=None)
-            model.load_state_dict(checkpoint["model"])
-        else:
-            model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=None)
-            model.load_state_dict(checkpoint)
-    else:
-        model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=args.weights)
-
-    # convert to DDP if need be
-    if args.distributed:
-        model = model.to(device)
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
-    else:
-        model.to(device)
-
-    return model
-
-
-def main(args):
-    """Load the model, build the evaluation loader for ``args.dataset`` and evaluate."""
-    evaluate(load_checkpoint(args), make_eval_loader(args.dataset, args), args)
-
-
-if __name__ == "__main__":
-    args = get_args_parser().parse_args()
-    main(args)
--- a/references/depth/stereo/parsing.py
+++ b/references/depth/stereo/parsing.py
-import argparse
-from functools import partial
-
-import torch
-
-from presets import StereoMatchingEvalPreset, StereoMatchingTrainPreset
-from torchvision.datasets import (
-    CarlaStereo,
-    CREStereo,
-    ETH3DStereo,
-    FallingThingsStereo,
-    InStereo2k,
-    Kitti2012Stereo,
-    Kitti2015Stereo,
-    Middlebury2014Stereo,
-    SceneFlowStereo,
-    SintelStereo,
-)
-
-# Maps every dataset name accepted on the command line to a factory for the
-# matching torchvision dataset; `make_dataset` supplies `root` and `transforms`.
-VALID_DATASETS = {
-    "crestereo": partial(CREStereo),
-    "carla-highres": partial(CarlaStereo),
-    "instereo2k": partial(InStereo2k),
-    "sintel": partial(SintelStereo),
-    "sceneflow-monkaa": partial(SceneFlowStereo, variant="Monkaa", pass_name="both"),
-    "sceneflow-flyingthings": partial(SceneFlowStereo, variant="FlyingThings3D", pass_name="both"),
-    "sceneflow-driving": partial(SceneFlowStereo, variant="Driving", pass_name="both"),
-    "fallingthings": partial(FallingThingsStereo, variant="both"),
-    "eth3d-train": partial(ETH3DStereo, split="train"),
-    "eth3d-test": partial(ETH3DStereo, split="test"),
-    "kitti2015-train": partial(Kitti2015Stereo, split="train"),
-    "kitti2015-test": partial(Kitti2015Stereo, split="test"),
-    "kitti2012-train": partial(Kitti2012Stereo, split="train"),
-    # the "-test" entry must select the test split, not the train one
-    "kitti2012-test": partial(Kitti2012Stereo, split="test"),
-    # Middlebury2014Stereo spells its keywords `use_ambient_views` and `calibration`
-    "middlebury2014-other": partial(
-        Middlebury2014Stereo, split="additional", use_ambient_views=True, calibration="both"
-    ),
-    "middlebury2014-train": partial(Middlebury2014Stereo, split="train", calibration="perfect"),
-    "middlebury2014-test": partial(Middlebury2014Stereo, split="test", calibration=None),
-    "middlebury2014-train-ambient": partial(
-        Middlebury2014Stereo, split="train", use_ambient_views=True, calibration="perfect"
-    ),
-}
-
-
-def make_train_transform(args: argparse.Namespace) -> torch.nn.Module:
-    return StereoMatchingTrainPreset(
-        resize_size=args.resize_size,
-        crop_size=args.crop_size,
-        rescale_prob=args.rescale_prob,
-        scaling_type=args.scaling_type,
-        scale_range=args.scale_range,
-        scale_interpolation_type=args.interpolation_strategy,
-        use_grayscale=args.use_grayscale,
-        mean=args.norm_mean,
-        std=args.norm_std,
-        horizontal_flip_prob=args.flip_prob,
-        gpu_transforms=args.gpu_transforms,
-        max_disparity=args.max_disparity,
-        spatial_shift_prob=args.spatial_shift_prob,
-        spatial_shift_max_angle=args.spatial_shift_max_angle,
-        spatial_shift_max_displacement=args.spatial_shift_max_displacement,
-        spatial_shift_interpolation_type=args.interpolation_strategy,
-        gamma_range=args.gamma_range,
-        brightness=args.brightness_range,
-        contrast=args.contrast_range,
-        saturation=args.saturation_range,
-        hue=args.hue_range,
-        asymmetric_jitter_prob=args.asymmetric_jitter_prob,
-    )
-
-
-def make_eval_transform(args: argparse.Namespace) -> torch.nn.Module:
-    if args.eval_size is None:
-        resize_size = args.crop_size
-    else:
-        resize_size = args.eval_size
-
-    return StereoMatchingEvalPreset(
-        mean=args.norm_mean,
-        std=args.norm_std,
-        use_grayscale=args.use_grayscale,
-        resize_size=resize_size,
-        interpolation_type=args.interpolation_strategy,
-    )
-
-
-def make_dataset(dataset_name: str, dataset_root: str, transforms: torch.nn.Module) -> torch.utils.data.Dataset:
-    return VALID_DATASETS[dataset_name](root=dataset_root, transforms=transforms)
--- a/references/depth/stereo/presets.py
+++ b/references/depth/stereo/presets.py
-from typing import Optional, Tuple, Union
-
-import torch
-import transforms as T
-
-
-class StereoMatchingEvalPreset(torch.nn.Module):
-    def __init__(
-        self,
-        mean: float = 0.5,
-        std: float = 0.5,
-        resize_size: Optional[Tuple[int, ...]] = None,
-        max_disparity: Optional[float] = None,
-        interpolation_type: str = "bilinear",
-        use_grayscale: bool = False,
-    ) -> None:
-        super().__init__()
-
-        transforms = [
-            T.ToTensor(),
-            T.ConvertImageDtype(torch.float32),
-        ]
-
-        if use_grayscale:
-            transforms.append(T.ConvertToGrayscale())
-
-        if resize_size is not None:
-            transforms.append(T.Resize(resize_size, interpolation_type=interpolation_type))
-
-        transforms.extend(
-            [
-                T.Normalize(mean=mean, std=std),
-                T.MakeValidDisparityMask(max_disparity=max_disparity),
-                T.ValidateModelInput(),
-            ]
-        )
-
-        self.transforms = T.Compose(transforms)
-
-    def forward(self, images, disparities, masks):
-        return self.transforms(images, disparities, masks)
-
-
-class StereoMatchingTrainPreset(torch.nn.Module):
-    def __init__(
-        self,
-        *,
-        resize_size: Optional[Tuple[int, ...]],
-        resize_interpolation_type: str = "bilinear",
-        # RandomResizeAndCrop params
-        crop_size: Tuple[int, int],
-        rescale_prob: float = 1.0,
-        scaling_type: str = "exponential",
-        scale_range: Tuple[float, float] = (-0.2, 0.5),
-        scale_interpolation_type: str = "bilinear",
-        # convert to grayscale
-        use_grayscale: bool = False,
-        # normalization params
-        mean: float = 0.5,
-        std: float = 0.5,
-        # processing device
-        gpu_transforms: bool = False,
-        # masking
-        max_disparity: Optional[int] = 256,
-        # SpatialShift params
-        spatial_shift_prob: float = 0.5,
-        spatial_shift_max_angle: float = 0.5,
-        spatial_shift_max_displacement: float = 0.5,
-        spatial_shift_interpolation_type: str = "bilinear",
-        # AssymetricColorJitter
-        gamma_range: Tuple[float, float] = (0.8, 1.2),
-        brightness: Union[int, Tuple[int, int]] = (0.8, 1.2),
-        contrast: Union[int, Tuple[int, int]] = (0.8, 1.2),
-        saturation: Union[int, Tuple[int, int]] = 0.0,
-        hue: Union[int, Tuple[int, int]] = 0.0,
-        asymmetric_jitter_prob: float = 1.0,
-        # RandomHorizontalFlip
-        horizontal_flip_prob: float = 0.5,
-        # RandomOcclusion
-        occlusion_prob: float = 0.0,
-        occlusion_px_range: Tuple[int, int] = (50, 100),
-        # RandomErase
-        erase_prob: float = 0.0,
-        erase_px_range: Tuple[int, int] = (50, 100),
-        erase_num_repeats: int = 1,
-    ) -> None:
-
-        if scaling_type not in ["linear", "exponential"]:
-            raise ValueError(f"Unknown scaling type: {scaling_type}. Available types: linear, exponential")
-
-        super().__init__()
-        transforms = [T.ToTensor()]
-
-        # when fixing size across multiple datasets, we ensure
-        # that the same size is used for all datasets when cropping
-        if resize_size is not None:
-            transforms.append(T.Resize(resize_size, interpolation_type=resize_interpolation_type))
-
-        if gpu_transforms:
-            transforms.append(T.ToGPU())
-
-        # color handling
-        color_transforms = [
-            T.AsymmetricColorJitter(
-                brightness=brightness, contrast=contrast, saturation=saturation, hue=hue, p=asymmetric_jitter_prob
-            ),
-            T.AsymetricGammaAdjust(p=asymmetric_jitter_prob, gamma_range=gamma_range),
-        ]
-
-        if use_grayscale:
-            color_transforms.append(T.ConvertToGrayscale())
-
-        transforms.extend(color_transforms)
-
-        transforms.extend(
-            [
-                T.RandomSpatialShift(
-                    p=spatial_shift_prob,
-                    max_angle=spatial_shift_max_angle,
-                    max_px_shift=spatial_shift_max_displacement,
-                    interpolation_type=spatial_shift_interpolation_type,
-                ),
-                T.ConvertImageDtype(torch.float32),
-                T.RandomRescaleAndCrop(
-                    crop_size=crop_size,
-                    scale_range=scale_range,
-                    rescale_prob=rescale_prob,
-                    scaling_type=scaling_type,
-                    interpolation_type=scale_interpolation_type,
-                ),
-                T.RandomHorizontalFlip(horizontal_flip_prob),
-                # occlusion after flip, otherwise we're occluding the reference image
-                T.RandomOcclusion(p=occlusion_prob, occlusion_px_range=occlusion_px_range),
-                T.RandomErase(p=erase_prob, erase_px_range=erase_px_range, max_erase=erase_num_repeats),
-                T.Normalize(mean=mean, std=std),
-                T.MakeValidDisparityMask(max_disparity),
-                T.ValidateModelInput(),
-            ]
-        )
-
-        self.transforms = T.Compose(transforms)
-
-    def forward(self, images, disparties, mask):
-        return self.transforms(images, disparties, mask)
--- a/references/depth/stereo/train.py
+++ b/references/depth/stereo/train.py
-import argparse
-import os
-import warnings
-from pathlib import Path
-from typing import List, Union
-
-import numpy as np
-import torch
-import torch.distributed as dist
-import torchvision.models.optical_flow
-import torchvision.prototype.models.depth.stereo
-import utils
-import vizualization
-
-from parsing import make_dataset, make_eval_transform, make_train_transform, VALID_DATASETS
-from torch import nn
-from torchvision.transforms.functional import get_dimensions, InterpolationMode, resize
-from utils.metrics import AVAILABLE_METRICS
-from utils.norm import freeze_batch_norm
-
-
-def make_stereo_flow(flow: Union[torch.Tensor, List[torch.Tensor]], model_out_channels: int) -> torch.Tensor:
-    """Helper function to make stereo flow from a given model output"""
-    if isinstance(flow, list):
-        return [make_stereo_flow(flow_i, model_out_channels) for flow_i in flow]
-
-    B, C, H, W = flow.shape
-    # we need to add zero flow if the model outputs 2 channels
-    if C == 1 and model_out_channels == 2:
-        zero_flow = torch.zeros_like(flow)
-        # by convention the flow is X-Y axis, so we need the Y flow last
-        flow = torch.cat([flow, zero_flow], dim=1)
-    return flow
-
-
-def make_lr_schedule(args: argparse.Namespace, optimizer: torch.optim.Optimizer) -> np.ndarray:
-    """Helper function to return a learning rate scheduler for CRE-stereo"""
-    if args.decay_after_steps < args.warmup_steps:
-        raise ValueError(f"decay_after_steps: {args.function} must be greater than warmup_steps: {args.warmup_steps}")
-
-    warmup_steps = args.warmup_steps if args.warmup_steps else 0
-    flat_lr_steps = args.decay_after_steps - warmup_steps if args.decay_after_steps else 0
-    decay_lr_steps = args.total_iterations - flat_lr_steps
-
-    max_lr = args.lr
-    min_lr = args.min_lr
-
-    schedulers = []
-    milestones = []
-
-    if warmup_steps > 0:
-        if args.lr_warmup_method == "linear":
-            warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(
-                optimizer, start_factor=args.lr_warmup_factor, total_iters=warmup_steps
-            )
-        elif args.lr_warmup_method == "constant":
-            warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(
-                optimizer, factor=args.lr_warmup_factor, total_iters=warmup_steps
-            )
-        else:
-            raise ValueError(f"Unknown lr warmup method {args.lr_warmup_method}")
-        schedulers.append(warmup_lr_scheduler)
-        milestones.append(warmup_steps)
-
-    if flat_lr_steps > 0:
-        flat_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=max_lr, total_iters=flat_lr_steps)
-        schedulers.append(flat_lr_scheduler)
-        milestones.append(flat_lr_steps + warmup_steps)
-
-    if decay_lr_steps > 0:
-        if args.lr_decay_method == "cosine":
-            decay_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
-                optimizer, T_max=decay_lr_steps, eta_min=min_lr
-            )
-        elif args.lr_decay_method == "linear":
-            decay_lr_scheduler = torch.optim.lr_scheduler.LinearLR(
-                optimizer, start_factor=max_lr, end_factor=min_lr, total_iters=decay_lr_steps
-            )
-        elif args.lr_decay_method == "exponential":
-            decay_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(
-                optimizer, gamma=args.lr_decay_gamma, last_epoch=-1
-            )
-        else:
-            raise ValueError(f"Unknown lr decay method {args.lr_decay_method}")
-        schedulers.append(decay_lr_scheduler)
-
-    scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers, milestones=milestones)
-    return scheduler
-
-
-def shuffle_dataset(dataset):
-    """Shuffle the dataset"""
-    perm = torch.randperm(len(dataset))
-    return torch.utils.data.Subset(dataset, perm)
-
-
-def resize_dataset_to_n_steps(
-    dataset: torch.utils.data.Dataset, dataset_steps: int, samples_per_step: int, args: argparse.Namespace
-) -> torch.utils.data.Dataset:
-    original_size = len(dataset)
-    if args.steps_is_epochs:
-        samples_per_step = original_size
-    target_size = dataset_steps * samples_per_step
-
-    dataset_copies = []
-    n_expands, remainder = divmod(target_size, original_size)
-    for idx in range(n_expands):
-        dataset_copies.append(dataset)
-
-    if remainder > 0:
-        dataset_copies.append(torch.utils.data.Subset(dataset, list(range(remainder))))
-
-    if args.dataset_shuffle:
-        dataset_copies = [shuffle_dataset(dataset_copy) for dataset_copy in dataset_copies]
-
-    dataset = torch.utils.data.ConcatDataset(dataset_copies)
-    return dataset
-
-
-def get_train_dataset(dataset_root: str, args: argparse.Namespace) -> torch.utils.data.Dataset:
-    datasets = []
-    for dataset_name in args.train_datasets:
-        transform = make_train_transform(args)
-        dataset = make_dataset(dataset_name, dataset_root, transform)
-        datasets.append(dataset)
-
-    if len(datasets) == 0:
-        raise ValueError("No datasets specified for training")
-
-    samples_per_step = args.world_size * args.batch_size
-
-    for idx, (dataset, steps_per_dataset) in enumerate(zip(datasets, args.dataset_steps)):
-        datasets[idx] = resize_dataset_to_n_steps(dataset, steps_per_dataset, samples_per_step, args)
-
-    dataset = torch.utils.data.ConcatDataset(datasets)
-    if args.dataset_order_shuffle:
-        dataset = shuffle_dataset(dataset)
-
-    print(f"Training dataset: {len(dataset)} samples")
-    return dataset
-
-
-@torch.inference_mode()
-def _evaluate(
-    model,
-    args,
-    val_loader,
-    *,
-    padder_mode,
-    print_freq=10,
-    writter=None,
-    step=None,
-    iterations=None,
-    batch_size=None,
-    header=None,
-):
-    """Helper function to compute various metrics (epe, etc.) for a model on a given dataset."""
-    model.eval()
-    header = header or "Test:"
-    device = torch.device(args.device)
-    metric_logger = utils.MetricLogger(delimiter="  ")
-
-    iterations = iterations or args.recurrent_updates
-
-    logger = utils.MetricLogger()
-    for meter_name in args.metrics:
-        logger.add_meter(meter_name, fmt="{global_avg:.4f}")
-    if "fl-all" not in args.metrics:
-        logger.add_meter("fl-all", fmt="{global_avg:.4f}")
-
-    num_processed_samples = 0
-    with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16):
-        for blob in metric_logger.log_every(val_loader, print_freq, header):
-            image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob)
-            padder = utils.InputPadder(image_left.shape, mode=padder_mode)
-            image_left, image_right = padder.pad(image_left, image_right)
-
-            disp_predictions = model(image_left, image_right, flow_init=None, num_iters=iterations)
-            disp_pred = disp_predictions[-1][:, :1, :, :]
-            disp_pred = padder.unpad(disp_pred)
-
-            metrics, _ = utils.compute_metrics(disp_pred, disp_gt, valid_disp_mask, metrics=logger.meters.keys())
-            num_processed_samples += image_left.shape[0]
-            for name in metrics:
-                logger.meters[name].update(metrics[name], n=1)
-
-    num_processed_samples = utils.reduce_across_processes(num_processed_samples)
-
-    print("Num_processed_samples: ", num_processed_samples)
-    if (
-        hasattr(val_loader.dataset, "__len__")
-        and len(val_loader.dataset) != num_processed_samples
-        and torch.distributed.get_rank() == 0
-    ):
-        warnings.warn(
-            f"Number of processed samples {num_processed_samples} is different"
-            f"from the dataset size {len(val_loader.dataset)}. This may happen if"
-            "the dataset is not divisible by the batch size. Try lowering the batch size or GPU number for more accurate results."
-        )
-
-    if writter is not None and args.rank == 0:
-        for meter_name, meter_value in logger.meters.items():
-            scalar_name = f"{meter_name} {header}"
-            writter.add_scalar(scalar_name, meter_value.avg, step)
-
-    logger.synchronize_between_processes()
-    print(header, logger)
-
-
-def make_eval_loader(dataset_name: str, args: argparse.Namespace) -> torch.utils.data.DataLoader:
-    if args.weights:
-        weights = torchvision.models.get_weight(args.weights)
-        trans = weights.transforms()
-
-        def preprocessing(image_left, image_right, disp, valid_disp_mask):
-            C_o, H_o, W_o = get_dimensions(image_left)
-            image_left, image_right = trans(image_left, image_right)
-
-            C_t, H_t, W_t = get_dimensions(image_left)
-            scale_factor = W_t / W_o
-
-            if disp is not None and not isinstance(disp, torch.Tensor):
-                disp = torch.from_numpy(disp)
-                if W_t != W_o:
-                    disp = resize(disp, (H_t, W_t), mode=InterpolationMode.BILINEAR) * scale_factor
-            if valid_disp_mask is not None and not isinstance(valid_disp_mask, torch.Tensor):
-                valid_disp_mask = torch.from_numpy(valid_disp_mask)
-                if W_t != W_o:
-                    valid_disp_mask = resize(valid_disp_mask, (H_t, W_t), mode=InterpolationMode.NEAREST)
-            return image_left, image_right, disp, valid_disp_mask
-
-    else:
-        preprocessing = make_eval_transform(args)
-
-    val_dataset = make_dataset(dataset_name, args.dataset_root, transforms=preprocessing)
-    if args.distributed:
-        sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=False)
-    else:
-        sampler = torch.utils.data.SequentialSampler(val_dataset)
-
-    val_loader = torch.utils.data.DataLoader(
-        val_dataset,
-        sampler=sampler,
-        batch_size=args.batch_size,
-        pin_memory=True,
-        num_workers=args.workers,
-    )
-
-    return val_loader
-
-
-def evaluate(model, loaders, args, writter=None, step=None):
-    for loader_name, loader in loaders.items():
-        _evaluate(
-            model,
-            args,
-            loader,
-            iterations=args.recurrent_updates,
-            padder_mode=args.padder_type,
-            header=f"{loader_name} evaluation",
-            batch_size=args.batch_size,
-            writter=writter,
-            step=step,
-        )
-
-
-def run(model, optimizer, scheduler, train_loader, val_loaders, logger, writer, scaler, args):
-    device = torch.device(args.device)
-    # wrap the loader in a logger
-    loader = iter(logger.log_every(train_loader))
-    # output channels
-    model_out_channels = model.module.output_channels if args.distributed else model.output_channels
-
-    torch.set_num_threads(args.threads)
-
-    sequence_criterion = utils.SequenceLoss(
-        gamma=args.gamma,
-        max_flow=args.max_disparity,
-        exclude_large_flows=args.flow_loss_exclude_large,
-    ).to(device)
-
-    if args.consistency_weight:
-        consistency_criterion = utils.FlowSequenceConsistencyLoss(
-            args.gamma,
-            resize_factor=0.25,
-            rescale_factor=0.25,
-            rescale_mode="bilinear",
-        ).to(device)
-    else:
-        consistency_criterion = None
-
-    if args.psnr_weight:
-        psnr_criterion = utils.PSNRLoss().to(device)
-    else:
-        psnr_criterion = None
-
-    if args.smoothness_weight:
-        smoothness_criterion = utils.SmoothnessLoss().to(device)
-    else:
-        smoothness_criterion = None
-
-    if args.photometric_weight:
-        photometric_criterion = utils.FlowPhotoMetricLoss(
-            ssim_weight=args.photometric_ssim_weight,
-            max_displacement_ratio=args.photometric_max_displacement_ratio,
-            ssim_use_padding=False,
-        ).to(device)
-    else:
-        photometric_criterion = None
-
-    for step in range(args.start_step + 1, args.total_iterations + 1):
-        data_blob = next(loader)
-        optimizer.zero_grad()
-
-        # unpack the data blob
-        image_left, image_right, disp_mask, valid_disp_mask = (x.to(device) for x in data_blob)
-        with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16):
-            disp_predictions = model(image_left, image_right, flow_init=None, num_iters=args.recurrent_updates)
-            # different models have different outputs, make sure we get the right ones for this task
-            disp_predictions = make_stereo_flow(disp_predictions, model_out_channels)
-            # should the architecture or training loop require it, we have to adjust the disparity mask
-            # target to possibly look like an optical flow mask
-            disp_mask = make_stereo_flow(disp_mask, model_out_channels)
-            # sequence loss on top of the model outputs
-
-        loss = sequence_criterion(disp_predictions, disp_mask, valid_disp_mask) * args.flow_loss_weight
-
-        if args.consistency_weight > 0:
-            loss_consistency = consistency_criterion(disp_predictions)
-            loss += loss_consistency * args.consistency_weight
-
-        if args.psnr_weight > 0:
-            loss_psnr = 0.0
-            for pred in disp_predictions:
-                # predictions might have 2 channels
-                loss_psnr += psnr_criterion(
-                    pred * valid_disp_mask.unsqueeze(1),
-                    disp_mask * valid_disp_mask.unsqueeze(1),
-                ).mean()  # mean the psnr loss over the batch
-            loss += loss_psnr / len(disp_predictions) * args.psnr_weight
-
-        if args.photometric_weight > 0:
-            loss_photometric = 0.0
-            for pred in disp_predictions:
-                # predictions might have 1 channel, therefore we need to inpute 0s for the second channel
-                if model_out_channels == 1:
-                    pred = torch.cat([pred, torch.zeros_like(pred)], dim=1)
-
-                loss_photometric += photometric_criterion(
-                    image_left, image_right, pred, valid_disp_mask
-                )  # photometric loss already comes out meaned over the batch
-            loss += loss_photometric / len(disp_predictions) * args.photometric_weight
-
-        if args.smoothness_weight > 0:
-            loss_smoothness = 0.0
-            for pred in disp_predictions:
-                # predictions might have 2 channels
-                loss_smoothness += smoothness_criterion(
-                    image_left, pred[:, :1, :, :]
-                ).mean()  # mean the smoothness loss over the batch
-            loss += loss_smoothness / len(disp_predictions) * args.smoothness_weight
-
-        with torch.no_grad():
-            metrics, _ = utils.compute_metrics(
-                disp_predictions[-1][:, :1, :, :],  # predictions might have 2 channels
-                disp_mask[:, :1, :, :],  # so does the ground truth
-                valid_disp_mask,
-                args.metrics,
-            )
-
-        metrics.pop("fl-all", None)
-        logger.update(loss=loss, **metrics)
-
-        if scaler is not None:
-            scaler.scale(loss).backward()
-            scaler.unscale_(optimizer)
-            if args.clip_grad_norm:
-                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm)
-            scaler.step(optimizer)
-            scaler.update()
-        else:
-            loss.backward()
-            if args.clip_grad_norm:
-                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm)
-            optimizer.step()
-
-        scheduler.step()
-
-        if not dist.is_initialized() or dist.get_rank() == 0:
-            if writer is not None and step % args.tensorboard_log_frequency == 0:
-                # log the loss and metrics to tensorboard
-
-                writer.add_scalar("loss", loss, step)
-                for name, value in logger.meters.items():
-                    writer.add_scalar(name, value.avg, step)
-                # log the images to tensorboard
-                pred_grid = vizualization.make_training_sample_grid(
-                    image_left, image_right, disp_mask, valid_disp_mask, disp_predictions
-                )
-                writer.add_image("predictions", pred_grid, step, dataformats="HWC")
-
-                # second thing we want to see is how relevant the iterative refinement is
-                pred_sequence_grid = vizualization.make_disparity_sequence_grid(disp_predictions, disp_mask)
-                writer.add_image("sequence", pred_sequence_grid, step, dataformats="HWC")
-
-        if step % args.save_frequency == 0:
-            if not args.distributed or args.rank == 0:
-                model_without_ddp = (
-                    model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
-                )
-                checkpoint = {
-                    "model": model_without_ddp.state_dict(),
-                    "optimizer": optimizer.state_dict(),
-                    "scheduler": scheduler.state_dict(),
-                    "step": step,
-                    "args": args,
-                }
-                os.makedirs(args.checkpoint_dir, exist_ok=True)
-                torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}_{step}.pth")
-                torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}.pth")
-
-        if step % args.valid_frequency == 0:
-            evaluate(model, val_loaders, args, writer, step)
-            model.train()
-            if args.freeze_batch_norm:
-                if isinstance(model, nn.parallel.DistributedDataParallel):
-                    freeze_batch_norm(model.module)
-                else:
-                    freeze_batch_norm(model)
-
-    # one final save at the end
-    if not args.distributed or args.rank == 0:
-        model_without_ddp = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model
-        checkpoint = {
-            "model": model_without_ddp.state_dict(),
-            "optimizer": optimizer.state_dict(),
-            "scheduler": scheduler.state_dict(),
-            "step": step,
-            "args": args,
-        }
-        os.makedirs(args.checkpoint_dir, exist_ok=True)
-        torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}_{step}.pth")
-        torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}.pth")
-
-
-def main(args):
-    args.total_iterations = sum(args.dataset_steps)
-
-    # intialize DDP setting
-    utils.setup_ddp(args)
-    print(args)
-
-    args.test_only = args.train_datasets is None
-
-    # set the appropiate devices
-    if args.distributed and args.device == "cpu":
-        raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun")
-    device = torch.device(args.device)
-
-    # select model architecture
-    model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=args.weights)
-
-    # convert to DDP if need be
-    if args.distributed:
-        model = model.to(args.gpu)
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
-        model_without_ddp = model.module
-    else:
-        model.to(device)
-        model_without_ddp = model
-
-    os.makedirs(args.checkpoint_dir, exist_ok=True)
-
-    val_loaders = {name: make_eval_loader(name, args) for name in args.test_datasets}
-
-    # EVAL ONLY configurations
-    if args.test_only:
-        evaluate(model, val_loaders, args)
-        return
-
-    # Sanity check for the parameter count
-    print(f"Parameter Count: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
-
-    # Compose the training dataset
-    train_dataset = get_train_dataset(args.dataset_root, args)
-
-    # initialize the optimizer
-    if args.optimizer == "adam":
-        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
-    elif args.optimizer == "sgd":
-        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.weight_decay, momentum=0.9)
-    else:
-        raise ValueError(f"Unknown optimizer {args.optimizer}. Please choose between adam and sgd")
-
-    # initialize the learning rate schedule
-    scheduler = make_lr_schedule(args, optimizer)
-
-    # load them from checkpoint if need
-    args.start_step = 0
-    if args.resume_path is not None:
-        checkpoint = torch.load(args.resume_path, map_location="cpu")
-        if "model" in checkpoint:
-            # this means the user requested to resume from a training checkpoint
-            model_without_ddp.load_state_dict(checkpoint["model"])
-            # this means the user wants to continue training from where it was left off
-            if args.resume_schedule:
-                optimizer.load_state_dict(checkpoint["optimizer"])
-                scheduler.load_state_dict(checkpoint["scheduler"])
-                args.start_step = checkpoint["step"] + 1
-                # modify starting point of the dat
-                sample_start_step = args.start_step * args.batch_size * args.world_size
-                train_dataset = train_dataset[sample_start_step:]
-
-        else:
-            # this means the user wants to finetune on top of a model state dict
-            # and that no other changes are required
-            model_without_ddp.load_state_dict(checkpoint)
-
-    torch.backends.cudnn.benchmark = True
-
-    # enable training mode
-    model.train()
-    if args.freeze_batch_norm:
-        freeze_batch_norm(model_without_ddp)
-
-    # put dataloader on top of the dataset
-    # make sure to disable shuffling since the dataset is already shuffled
-    # in order to guarantee quasi randomness whilst retaining a deterministic
-    # dataset consumption order
-    if args.distributed:
-        # the train dataset is preshuffled in order to respect the iteration order
-        sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=False, drop_last=True)
-    else:
-        # the train dataset is already shuffled so we can use a simple SequentialSampler
-        sampler = torch.utils.data.SequentialSampler(train_dataset)
-
-    train_loader = torch.utils.data.DataLoader(
-        train_dataset,
-        sampler=sampler,
-        batch_size=args.batch_size,
-        pin_memory=True,
-        num_workers=args.workers,
-    )
-
-    # intialize the logger
-    if args.tensorboard_summaries:
-        from torch.utils.tensorboard import SummaryWriter
-
-        tensorboard_path = Path(args.checkpoint_dir) / "tensorboard"
-        os.makedirs(tensorboard_path, exist_ok=True)
-
-        tensorboard_run = tensorboard_path / f"{args.name}"
-        writer = SummaryWriter(tensorboard_run)
-    else:
-        writer = None
-
-    logger = utils.MetricLogger(delimiter="  ")
-
-    scaler = torch.cuda.amp.GradScaler() if args.mixed_precision else None
-    # run the training loop
-    # this will perform optimization, respectively logging and saving checkpoints
-    # when need be
-    run(
-        model=model,
-        optimizer=optimizer,
-        scheduler=scheduler,
-        train_loader=train_loader,
-        val_loaders=val_loaders,
-        logger=logger,
-        writer=writer,
-        scaler=scaler,
-        args=args,
-    )
-
-
-def get_args_parser(add_help=True):
-    import argparse
-
-    parser = argparse.ArgumentParser(description="PyTorch Stereo Matching Training", add_help=add_help)
-    # checkpointing
-    parser.add_argument("--name", default="crestereo", help="name of the experiment")
-    parser.add_argument("--resume", type=str, default=None, help="from which checkpoint to resume")
-    parser.add_argument("--checkpoint-dir", type=str, default="checkpoints", help="path to the checkpoint directory")
-
-    # dataset
-    parser.add_argument("--dataset-root", type=str, default="", help="path to the dataset root directory")
-    parser.add_argument(
-        "--train-datasets",
-        type=str,
-        nargs="+",
-        default=["crestereo"],
-        help="dataset(s) to train on",
-        choices=list(VALID_DATASETS.keys()),
-    )
-    parser.add_argument(
-        "--dataset-steps", type=int, nargs="+", default=[300_000], help="number of steps for each dataset"
-    )
-    parser.add_argument(
-        "--steps-is-epochs", action="store_true", help="if set, dataset-steps are interpreted as epochs"
-    )
-    parser.add_argument(
-        "--test-datasets",
-        type=str,
-        nargs="+",
-        default=["middlebury2014-train"],
-        help="dataset(s) to test on",
-        choices=["middlebury2014-train"],
-    )
-    parser.add_argument("--dataset-shuffle", type=bool, help="shuffle the dataset", default=True)
-    parser.add_argument("--dataset-order-shuffle", type=bool, help="shuffle the dataset order", default=True)
-    parser.add_argument("--batch-size", type=int, default=2, help="batch size per GPU")
-    parser.add_argument("--workers", type=int, default=4, help="number of workers per GPU")
-    parser.add_argument(
-        "--threads",
-        type=int,
-        default=16,
-        help="number of CPU threads per GPU. This can be changed around to speed-up transforms if needed. This can lead to worker thread contention so use with care.",
-    )
-
-    # model architecture
-    parser.add_argument(
-        "--model",
-        type=str,
-        default="crestereo_base",
-        help="model architecture",
-        choices=["crestereo_base", "raft_stereo"],
-    )
-    parser.add_argument("--recurrent-updates", type=int, default=10, help="number of recurrent updates")
-    parser.add_argument("--freeze-batch-norm", action="store_true", help="freeze batch norm parameters")
-
-    # loss parameters
-    parser.add_argument("--gamma", type=float, default=0.8, help="gamma parameter for the flow sequence loss")
-    parser.add_argument("--flow-loss-weight", type=float, default=1.0, help="weight for the flow loss")
-    parser.add_argument(
-        "--flow-loss-exclude-large",
-        action="store_true",
-        help="exclude large flow values from the loss. A large value is defined as a value greater than the ground truth flow norm",
-        default=False,
-    )
-    parser.add_argument("--consistency-weight", type=float, default=0.0, help="consistency loss weight")
-    parser.add_argument(
-        "--consistency-resize-factor",
-        type=float,
-        default=0.25,
-        help="consistency loss resize factor to account for the fact that the flow is computed on a downsampled image",
-    )
-    parser.add_argument("--psnr-weight", type=float, default=0.0, help="psnr loss weight")
-    parser.add_argument("--smoothness-weight", type=float, default=0.0, help="smoothness loss weight")
-    parser.add_argument("--photometric-weight", type=float, default=0.0, help="photometric loss weight")
-    parser.add_argument(
-        "--photometric-max-displacement-ratio",
-        type=float,
-        default=0.15,
-        help="Only pixels with a displacement smaller than this ratio of the image width will be considered for the photometric loss",
-    )
-    parser.add_argument("--photometric-ssim-weight", type=float, default=0.85, help="photometric ssim loss weight")
-
-    # transforms parameters
-    parser.add_argument("--gpu-transforms", action="store_true", help="use GPU transforms")
-    parser.add_argument(
-        "--eval-size", type=int, nargs="+", default=[384, 512], help="size of the images for evaluation"
-    )
-    parser.add_argument("--resize-size", type=int, nargs=2, default=None, help="resize size")
-    parser.add_argument("--crop-size", type=int, nargs=2, default=[384, 512], help="crop size")
-    parser.add_argument("--scale-range", type=float, nargs=2, default=[0.6, 1.0], help="random scale range")
-    parser.add_argument("--rescale-prob", type=float, default=1.0, help="probability of resizing the image")
-    parser.add_argument(
-        "--scaling-type", type=str, default="linear", help="scaling type", choices=["exponential", "linear"]
-    )
-    parser.add_argument("--flip-prob", type=float, default=0.5, help="probability of flipping the image")
-    parser.add_argument(
-        "--norm-mean", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="mean for image normalization"
-    )
-    parser.add_argument(
-        "--norm-std", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="std for image normalization"
-    )
-    parser.add_argument(
-        "--use-grayscale", action="store_true", help="use grayscale images instead of RGB", default=False
-    )
-    parser.add_argument("--max-disparity", type=float, default=None, help="maximum disparity")
-    parser.add_argument(
-        "--interpolation-strategy",
-        type=str,
-        default="bilinear",
-        help="interpolation strategy",
-        choices=["bilinear", "bicubic", "mixed"],
-    )
-    parser.add_argument("--spatial-shift-prob", type=float, default=1.0, help="probability of shifting the image")
-    parser.add_argument(
-        "--spatial-shift-max-angle", type=float, default=0.1, help="maximum angle for the spatial shift"
-    )
-    parser.add_argument(
-        "--spatial-shift-max-displacement", type=float, default=2.0, help="maximum displacement for the spatial shift"
-    )
-    parser.add_argument("--gamma-range", type=float, nargs="+", default=[0.8, 1.2], help="range for gamma correction")
-    parser.add_argument(
-        "--brightness-range", type=float, nargs="+", default=[0.8, 1.2], help="range for brightness correction"
-    )
-    parser.add_argument(
-        "--contrast-range", type=float, nargs="+", default=[0.8, 1.2], help="range for contrast correction"
-    )
-    parser.add_argument(
-        "--saturation-range", type=float, nargs="+", default=0.0, help="range for saturation correction"
-    )
-    parser.add_argument("--hue-range", type=float, nargs="+", default=0.0, help="range for hue correction")
-    parser.add_argument(
-        "--asymmetric-jitter-prob",
-        type=float,
-        default=1.0,
-        help="probability of using asymmetric jitter instead of symmetric jitter",
-    )
-    parser.add_argument("--occlusion-prob", type=float, default=0.5, help="probability of occluding the rightimage")
-    parser.add_argument(
-        "--occlusion-px-range", type=int, nargs="+", default=[50, 100], help="range for the number of occluded pixels"
-    )
-    parser.add_argument("--erase-prob", type=float, default=0.0, help="probability of erasing in both images")
-    parser.add_argument(
-        "--erase-px-range", type=int, nargs="+", default=[50, 100], help="range for the number of erased pixels"
-    )
-    parser.add_argument(
-        "--erase-num-repeats", type=int, default=1, help="number of times to repeat the erase operation"
-    )
-
-    # optimizer parameters
-    parser.add_argument("--optimizer", type=str, default="adam", help="optimizer", choices=["adam", "sgd"])
-    parser.add_argument("--lr", type=float, default=4e-4, help="learning rate")
-    parser.add_argument("--weight-decay", type=float, default=0.0, help="weight decay")
-    parser.add_argument("--clip-grad-norm", type=float, default=0.0, help="clip grad norm")
-
-    # lr_scheduler parameters
-    parser.add_argument("--min-lr", type=float, default=2e-5, help="minimum learning rate")
-    parser.add_argument("--warmup-steps", type=int, default=6_000, help="number of warmup steps")
-    parser.add_argument(
-        "--decay-after-steps", type=int, default=180_000, help="number of steps after which to start decay the lr"
-    )
-    parser.add_argument(
-        "--lr-warmup-method", type=str, default="linear", help="warmup method", choices=["linear", "cosine"]
-    )
-    parser.add_argument("--lr-warmup-factor", type=float, default=0.02, help="warmup factor for the learning rate")
-    parser.add_argument(
-        "--lr-decay-method",
-        type=str,
-        default="linear",
-        help="decay method",
-        choices=["linear", "cosine", "exponential"],
-    )
-    parser.add_argument("--lr-decay-gamma", type=float, default=0.8, help="decay factor for the learning rate")
-
-    # deterministic behaviour
-    parser.add_argument("--seed", type=int, default=42, help="seed for random number generators")
-
-    # mixed precision training
-    parser.add_argument("--mixed-precision", action="store_true", help="use mixed precision training")
-
-    # logging
-    parser.add_argument("--tensorboard-summaries", action="store_true", help="log to tensorboard")
-    parser.add_argument("--tensorboard-log-frequency", type=int, default=100, help="log frequency")
-    parser.add_argument("--save-frequency", type=int, default=1_000, help="save frequency")
-    parser.add_argument("--valid-frequency", type=int, default=1_000, help="validation frequency")
-    parser.add_argument(
-        "--metrics",
-        type=str,
-        nargs="+",
-        default=["mae", "rmse", "1px", "3px", "5px", "relepe"],
-        help="metrics to log",
-        choices=AVAILABLE_METRICS,
-    )
-
-    # distributed parameters
-    parser.add_argument("--world-size", type=int, default=8, help="number of distributed processes")
-    parser.add_argument("--dist-url", type=str, default="env://", help="url used to set up distributed training")
-    parser.add_argument("--device", type=str, default="cuda", help="device to use for training")
-
-    # weights API
-    parser.add_argument("--weights", type=str, default=None, help="weights API url")
-    parser.add_argument(
-        "--resume-path", type=str, default=None, help="a path from which to resume or start fine-tuning"
-    )
-    parser.add_argument("--resume-schedule", action="store_true", help="resume optimizer state")
-
-    # padder parameters
-    parser.add_argument("--padder-type", type=str, default="kitti", help="padder type", choices=["kitti", "sintel"])
-    return parser
-
-
-if __name__ == "__main__":
-    args = get_args_parser().parse_args()
-    main(args)
--- a/references/depth/stereo/transforms.py
+++ b/references/depth/stereo/transforms.py
-import random
-from typing import Callable, List, Optional, Sequence, Tuple, Union
-
-import numpy as np
-import PIL.Image
-import torch
-import torchvision.transforms as T
-import torchvision.transforms.functional as F
-from torch import Tensor
-
-T_FLOW = Union[Tensor, np.ndarray, None]
-T_MASK = Union[Tensor, np.ndarray, None]
-T_STEREO_TENSOR = Tuple[Tensor, Tensor]
-T_COLOR_AUG_PARAM = Union[float, Tuple[float, float]]
-
-
-def rand_float_range(size: Sequence[int], low: float, high: float) -> Tensor:
-    return (low - high) * torch.rand(size) + high
-
-
-class InterpolationStrategy:
-
-    _valid_modes: List[str] = ["mixed", "bicubic", "bilinear"]
-
-    def __init__(self, mode: str = "mixed") -> None:
-        if mode not in self._valid_modes:
-            raise ValueError(f"Invalid interpolation mode: {mode}. Valid modes are: {self._valid_modes}")
-
-        if mode == "mixed":
-            self.strategies = [F.InterpolationMode.BILINEAR, F.InterpolationMode.BICUBIC]
-        elif mode == "bicubic":
-            self.strategies = [F.InterpolationMode.BICUBIC]
-        elif mode == "bilinear":
-            self.strategies = [F.InterpolationMode.BILINEAR]
-
-    def __call__(self) -> F.InterpolationMode:
-        return random.choice(self.strategies)
-
-    @classmethod
-    def is_valid(mode: str) -> bool:
-        return mode in InterpolationStrategy._valid_modes
-
-    @property
-    def valid_modes() -> List[str]:
-        return InterpolationStrategy._valid_modes
-
-
-class ValidateModelInput(torch.nn.Module):
-    # Pass-through transform that checks the shape and dtypes to make sure the model gets what it expects
-    def forward(self, images: T_STEREO_TENSOR, disparities: T_FLOW, masks: T_MASK):
-        if images[0].shape != images[1].shape:
-            raise ValueError("img1 and img2 should have the same shape.")
-        h, w = images[0].shape[-2:]
-        if disparities[0] is not None and disparities[0].shape != (1, h, w):
-            raise ValueError(f"disparities[0].shape should be (1, {h}, {w}) instead of {disparities[0].shape}")
-        if masks[0] is not None:
-            if masks[0].shape != (h, w):
-                raise ValueError(f"masks[0].shape should be ({h}, {w}) instead of {masks[0].shape}")
-            if masks[0].dtype != torch.bool:
-                raise TypeError(f"masks[0] should be of dtype torch.bool instead of {masks[0].dtype}")
-
-        return images, disparities, masks
-
-
-class ConvertToGrayscale(torch.nn.Module):
-    def __init__(self) -> None:
-        super().__init__()
-
-    def forward(
-        self,
-        images: Tuple[PIL.Image.Image, PIL.Image.Image],
-        disparities: Tuple[T_FLOW, T_FLOW],
-        masks: Tuple[T_MASK, T_MASK],
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-        img_left = F.rgb_to_grayscale(images[0], num_output_channels=3)
-        img_right = F.rgb_to_grayscale(images[1], num_output_channels=3)
-
-        return (img_left, img_right), disparities, masks
-
-
-class MakeValidDisparityMask(torch.nn.Module):
-    def __init__(self, max_disparity: Optional[int] = 256) -> None:
-        super().__init__()
-        self.max_disparity = max_disparity
-
-    def forward(
-        self,
-        images: T_STEREO_TENSOR,
-        disparities: Tuple[T_FLOW, T_FLOW],
-        masks: Tuple[T_MASK, T_MASK],
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-        valid_masks = tuple(
-            torch.ones(images[idx].shape[-2:], dtype=torch.bool, device=images[idx].device) if mask is None else mask
-            for idx, mask in enumerate(masks)
-        )
-
-        valid_masks = tuple(
-            torch.logical_and(mask, disparity > 0).squeeze(0) if disparity is not None else mask
-            for mask, disparity in zip(valid_masks, disparities)
-        )
-
-        if self.max_disparity is not None:
-            valid_masks = tuple(
-                torch.logical_and(mask, disparity < self.max_disparity).squeeze(0) if disparity is not None else mask
-                for mask, disparity in zip(valid_masks, disparities)
-            )
-
-        return images, disparities, valid_masks
-
-
-class ToGPU(torch.nn.Module):
-    def __init__(self) -> None:
-        super().__init__()
-
-    def forward(
-        self,
-        images: T_STEREO_TENSOR,
-        disparities: Tuple[T_FLOW, T_FLOW],
-        masks: Tuple[T_MASK, T_MASK],
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-        dev_images = tuple(image.cuda() for image in images)
-        dev_disparities = tuple(map(lambda x: x.cuda() if x is not None else None, disparities))
-        dev_masks = tuple(map(lambda x: x.cuda() if x is not None else None, masks))
-        return dev_images, dev_disparities, dev_masks
-
-
-class ConvertImageDtype(torch.nn.Module):
-    def __init__(self, dtype: torch.dtype):
-        super().__init__()
-        self.dtype = dtype
-
-    def forward(
-        self,
-        images: T_STEREO_TENSOR,
-        disparities: Tuple[T_FLOW, T_FLOW],
-        masks: Tuple[T_MASK, T_MASK],
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-        img_left = F.convert_image_dtype(images[0], dtype=self.dtype)
-        img_right = F.convert_image_dtype(images[1], dtype=self.dtype)
-
-        img_left = img_left.contiguous()
-        img_right = img_right.contiguous()
-
-        return (img_left, img_right), disparities, masks
-
-
-class Normalize(torch.nn.Module):
-    def __init__(self, mean: List[float], std: List[float]) -> None:
-        super().__init__()
-        self.mean = mean
-        self.std = std
-
-    def forward(
-        self,
-        images: T_STEREO_TENSOR,
-        disparities: Tuple[T_FLOW, T_FLOW],
-        masks: Tuple[T_MASK, T_MASK],
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-
-        img_left = F.normalize(images[0], mean=self.mean, std=self.std)
-        img_right = F.normalize(images[1], mean=self.mean, std=self.std)
-
-        img_left = img_left.contiguous()
-        img_right = img_right.contiguous()
-
-        return (img_left, img_right), disparities, masks
-
-
-class ToTensor(torch.nn.Module):
-    def forward(
-        self,
-        images: Tuple[PIL.Image.Image, PIL.Image.Image],
-        disparities: Tuple[T_FLOW, T_FLOW],
-        masks: Tuple[T_MASK, T_MASK],
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-        if images[0] is None:
-            raise ValueError("img_left is None")
-        if images[1] is None:
-            raise ValueError("img_right is None")
-
-        img_left = F.pil_to_tensor(images[0])
-        img_right = F.pil_to_tensor(images[1])
-        disparity_tensors = ()
-        mask_tensors = ()
-
-        for idx in range(2):
-            disparity_tensors += (torch.from_numpy(disparities[idx]),) if disparities[idx] is not None else (None,)
-            mask_tensors += (torch.from_numpy(masks[idx]),) if masks[idx] is not None else (None,)
-
-        return (img_left, img_right), disparity_tensors, mask_tensors
-
-
-class AsymmetricColorJitter(T.ColorJitter):
-    # p determines the probability of doing asymmetric vs symmetric color jittering
-    def __init__(
-        self,
-        brightness: T_COLOR_AUG_PARAM = 0,
-        contrast: T_COLOR_AUG_PARAM = 0,
-        saturation: T_COLOR_AUG_PARAM = 0,
-        hue: T_COLOR_AUG_PARAM = 0,
-        p: float = 0.2,
-    ):
-        super().__init__(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue)
-        self.p = p
-
-    def forward(
-        self,
-        images: T_STEREO_TENSOR,
-        disparities: Tuple[T_FLOW, T_FLOW],
-        masks: Tuple[T_MASK, T_MASK],
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-
-        if torch.rand(1) < self.p:
-            # asymmetric: different transform for img1 and img2
-            img_left = super().forward(images[0])
-            img_right = super().forward(images[1])
-        else:
-            # symmetric: same transform for img1 and img2
-            batch = torch.stack(images)
-            batch = super().forward(batch)
-            img_left, img_right = batch[0], batch[1]
-
-        return (img_left, img_right), disparities, masks
-
-
-class AsymetricGammaAdjust(torch.nn.Module):
-    def __init__(self, p: float, gamma_range: Tuple[float, float], gain: float = 1) -> None:
-        super().__init__()
-        self.gamma_range = gamma_range
-        self.gain = gain
-        self.p = p
-
-    def forward(
-        self,
-        images: T_STEREO_TENSOR,
-        disparities: Tuple[T_FLOW, T_FLOW],
-        masks: Tuple[T_MASK, T_MASK],
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-
-        gamma = rand_float_range((1,), low=self.gamma_range[0], high=self.gamma_range[1]).item()
-
-        if torch.rand(1) < self.p:
-            # asymmetric: different transform for img1 and img2
-            img_left = F.adjust_gamma(images[0], gamma, gain=self.gain)
-            img_right = F.adjust_gamma(images[1], gamma, gain=self.gain)
-        else:
-            # symmetric: same transform for img1 and img2
-            batch = torch.stack(images)
-            batch = F.adjust_gamma(batch, gamma, gain=self.gain)
-            img_left, img_right = batch[0], batch[1]
-
-        return (img_left, img_right), disparities, masks
-
-
-class RandomErase(torch.nn.Module):
-    # Produces multiple symetric random erasures
-    # these can be viewed as occlusions present in both camera views.
-    # Similarly to Optical Flow occlusion prediction tasks, we mask these pixels in the disparity map
-    def __init__(
-        self,
-        p: float = 0.5,
-        erase_px_range: Tuple[int, int] = (50, 100),
-        value: Union[Tensor, float] = 0,
-        inplace: bool = False,
-        max_erase: int = 2,
-    ):
-        super().__init__()
-        self.min_px_erase = erase_px_range[0]
-        self.max_px_erase = erase_px_range[1]
-        if self.max_px_erase < 0:
-            raise ValueError("erase_px_range[1] should be equal or greater than 0")
-        if self.min_px_erase < 0:
-            raise ValueError("erase_px_range[0] should be equal or greater than 0")
-        if self.min_px_erase > self.max_px_erase:
-            raise ValueError("erase_prx_range[0] should be equal or lower than erase_px_range[1]")
-
-        self.p = p
-        self.value = value
-        self.inplace = inplace
-        self.max_erase = max_erase
-
-    def forward(
-        self,
-        images: T_STEREO_TENSOR,
-        disparities: T_STEREO_TENSOR,
-        masks: T_STEREO_TENSOR,
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-
-        if torch.rand(1) < self.p:
-            return images, disparities, masks
-
-        image_left, image_right = images
-        mask_left, mask_right = masks
-        for _ in range(torch.randint(self.max_erase, size=(1,)).item()):
-            y, x, h, w, v = self._get_params(image_left)
-            image_right = F.erase(image_right, y, x, h, w, v, self.inplace)
-            image_left = F.erase(image_left, y, x, h, w, v, self.inplace)
-            # similarly to optical flow occlusion prediction, we consider
-            # any erasure pixels that are in both images to be occluded therefore
-            # we mark them as invalid
-            if mask_left is not None:
-                mask_left = F.erase(mask_left, y, x, h, w, False, self.inplace)
-            if mask_right is not None:
-                mask_right = F.erase(mask_right, y, x, h, w, False, self.inplace)
-
-        return (image_left, image_right), disparities, (mask_left, mask_right)
-
-    def _get_params(self, img: torch.Tensor) -> Tuple[int, int, int, int, float]:
-        img_h, img_w = img.shape[-2:]
-        crop_h, crop_w = (
-            random.randint(self.min_px_erase, self.max_px_erase),
-            random.randint(self.min_px_erase, self.max_px_erase),
-        )
-        crop_x, crop_y = (random.randint(0, img_w - crop_w), random.randint(0, img_h - crop_h))
-
-        return crop_y, crop_x, crop_h, crop_w, self.value
-
-
-class RandomOcclusion(torch.nn.Module):
-    # This adds an occlusion in the right image
-    # the occluded patch works as a patch erase where the erase value is the mean
-    # of the pixels from the selected zone
-    def __init__(self, p: float = 0.5, occlusion_px_range: Tuple[int, int] = (50, 100), inplace: bool = False):
-        super().__init__()
-
-        self.min_px_occlusion = occlusion_px_range[0]
-        self.max_px_occlusion = occlusion_px_range[1]
-
-        if self.max_px_occlusion < 0:
-            raise ValueError("occlusion_px_range[1] should be greater or equal than 0")
-        if self.min_px_occlusion < 0:
-            raise ValueError("occlusion_px_range[0] should be greater or equal than 0")
-        if self.min_px_occlusion > self.max_px_occlusion:
-            raise ValueError("occlusion_px_range[0] should be lower than occlusion_px_range[1]")
-
-        self.p = p
-        self.inplace = inplace
-
-    def forward(
-        self,
-        images: T_STEREO_TENSOR,
-        disparities: T_STEREO_TENSOR,
-        masks: T_STEREO_TENSOR,
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-
-        left_image, right_image = images
-
-        if torch.rand(1) < self.p:
-            return images, disparities, masks
-
-        y, x, h, w, v = self._get_params(right_image)
-        right_image = F.erase(right_image, y, x, h, w, v, self.inplace)
-
-        return ((left_image, right_image), disparities, masks)
-
-    def _get_params(self, img: torch.Tensor) -> Tuple[int, int, int, int, float]:
-        img_h, img_w = img.shape[-2:]
-        crop_h, crop_w = (
-            random.randint(self.min_px_occlusion, self.max_px_occlusion),
-            random.randint(self.min_px_occlusion, self.max_px_occlusion),
-        )
-
-        crop_x, crop_y = (random.randint(0, img_w - crop_w), random.randint(0, img_h - crop_h))
-        occlusion_value = img[..., crop_y : crop_y + crop_h, crop_x : crop_x + crop_w].mean(dim=(-2, -1), keepdim=True)
-
-        return (crop_y, crop_x, crop_h, crop_w, occlusion_value)
-
-
-class RandomSpatialShift(torch.nn.Module):
-    # This transform applies a vertical shift and a slight angle rotation and the same time
-    def __init__(
-        self, p: float = 0.5, max_angle: float = 0.1, max_px_shift: int = 2, interpolation_type: str = "bilinear"
-    ) -> None:
-        super().__init__()
-        self.p = p
-        self.max_angle = max_angle
-        self.max_px_shift = max_px_shift
-        self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type)
-
-    def forward(
-        self,
-        images: T_STEREO_TENSOR,
-        disparities: T_STEREO_TENSOR,
-        masks: T_STEREO_TENSOR,
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-        # the transform is applied only on the right image
-        # in order to mimic slight calibration issues
-        img_left, img_right = images
-
-        INTERP_MODE = self._interpolation_mode_strategy()
-
-        if torch.rand(1) < self.p:
-            # [0, 1] -> [-a, a]
-            shift = rand_float_range((1,), low=-self.max_px_shift, high=self.max_px_shift).item()
-            angle = rand_float_range((1,), low=-self.max_angle, high=self.max_angle).item()
-            # sample center point for the rotation matrix
-            y = torch.randint(size=(1,), low=0, high=img_right.shape[-2]).item()
-            x = torch.randint(size=(1,), low=0, high=img_right.shape[-1]).item()
-            # apply affine transformations
-            img_right = F.affine(
-                img_right,
-                angle=angle,
-                translate=[0, shift],  # translation only on the y axis
-                center=[x, y],
-                scale=1.0,
-                shear=0.0,
-                interpolation=INTERP_MODE,
-            )
-
-        return ((img_left, img_right), disparities, masks)
-
-
-class RandomHorizontalFlip(torch.nn.Module):
-    def __init__(self, p: float = 0.5) -> None:
-        super().__init__()
-        self.p = p
-
-    def forward(
-        self,
-        images: T_STEREO_TENSOR,
-        disparities: Tuple[T_FLOW, T_FLOW],
-        masks: Tuple[T_MASK, T_MASK],
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-
-        img_left, img_right = images
-        dsp_left, dsp_right = disparities
-        mask_left, mask_right = masks
-
-        if dsp_right is not None and torch.rand(1) < self.p:
-            img_left, img_right = F.hflip(img_left), F.hflip(img_right)
-            dsp_left, dsp_right = F.hflip(dsp_left), F.hflip(dsp_right)
-            if mask_left is not None and mask_right is not None:
-                mask_left, mask_right = F.hflip(mask_left), F.hflip(mask_right)
-            return ((img_right, img_left), (dsp_right, dsp_left), (mask_right, mask_left))
-
-        return images, disparities, masks
-
-
-class Resize(torch.nn.Module):
-    def __init__(self, resize_size: Tuple[int, ...], interpolation_type: str = "bilinear") -> None:
-        super().__init__()
-        self.resize_size = list(resize_size)  # doing this to keep mypy happy
-        self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type)
-
-    def forward(
-        self,
-        images: T_STEREO_TENSOR,
-        disparities: Tuple[T_FLOW, T_FLOW],
-        masks: Tuple[T_MASK, T_MASK],
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-        resized_images = ()
-        resized_disparities = ()
-        resized_masks = ()
-
-        INTERP_MODE = self._interpolation_mode_strategy()
-
-        for img in images:
-            resized_images += (F.resize(img, self.resize_size, interpolation=INTERP_MODE),)
-
-        for dsp in disparities:
-            if dsp is not None:
-                # rescale disparity to match the new image size
-                scale_x = self.resize_size[1] / dsp.shape[-1]
-                resized_disparities += (F.resize(dsp, self.resize_size, interpolation=INTERP_MODE) * scale_x,)
-            else:
-                resized_disparities += (None,)
-
-        for mask in masks:
-            if mask is not None:
-                resized_masks += (
-                    # we squeeze and unsqueeze because the API requires > 3D tensors
-                    F.resize(
-                        mask.unsqueeze(0),
-                        self.resize_size,
-                        interpolation=F.InterpolationMode.NEAREST,
-                    ).squeeze(0),
-                )
-            else:
-                resized_masks += (None,)
-
-        return resized_images, resized_disparities, resized_masks
-
-
-class RandomRescaleAndCrop(torch.nn.Module):
-    # This transform will resize the input with a given proba, and then crop it.
-    # These are the reversed operations of the built-in RandomResizedCrop,
-    # although the order of the operations doesn't matter too much: resizing a
-    # crop would give the same result as cropping a resized image, up to
-    # interpolation artifact at the borders of the output.
-    #
-    # The reason we don't rely on RandomResizedCrop is because of a significant
-    # difference in the parametrization of both transforms, in particular,
-    # because of the way the random parameters are sampled in both transforms,
-    # which leads to fairly different resuts (and different epe). For more details see
-    # https://github.com/pytorch/vision/pull/5026/files#r762932579
-    def __init__(
-        self,
-        crop_size: Tuple[int, int],
-        scale_range: Tuple[float, float] = (-0.2, 0.5),
-        rescale_prob: float = 0.8,
-        scaling_type: str = "exponential",
-        interpolation_type: str = "bilinear",
-    ) -> None:
-        super().__init__()
-        self.crop_size = crop_size
-        self.min_scale = scale_range[0]
-        self.max_scale = scale_range[1]
-        self.rescale_prob = rescale_prob
-        self.scaling_type = scaling_type
-        self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type)
-
-        if self.scaling_type == "linear" and self.min_scale < 0:
-            raise ValueError("min_scale must be >= 0 for linear scaling")
-
-    def forward(
-        self,
-        images: T_STEREO_TENSOR,
-        disparities: Tuple[T_FLOW, T_FLOW],
-        masks: Tuple[T_MASK, T_MASK],
-    ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]:
-        """Randomly rescale (with probability ``self.rescale_prob``) and then randomly crop a stereo sample.
-
-        Args:
-            images: pair ``(left, right)`` of image tensors.
-            disparities: pair ``(left, right)`` of disparity maps; an entry may be ``None``.
-            masks: pair ``(left, right)`` of validity masks; an entry may be ``None``.
-
-        Returns:
-            ``((img_left, img_right), (dsp_left, dsp_right), masks)``, all cropped to ``self.crop_size``.
-
-        Raises:
-            ValueError: if ``self.scaling_type`` is neither ``"exponential"`` nor ``"linear"``.
-        """
-
-        img_left, img_right = images
-        dsp_left, dsp_right = disparities
-        mask_left, mask_right = masks
-        INTERP_MODE = self._interpolation_mode_strategy()
-
-        # randomly sample scale
-        h, w = img_left.shape[-2:]
-        # Note: in original code, they use + 1 instead of + 8 for sparse datasets (e.g. Kitti)
-        # It shouldn't matter much
-        min_scale = max((self.crop_size[0] + 8) / h, (self.crop_size[1] + 8) / w)
-
-        # Exponential scaling draws an exponent in (min_scale, max_scale) and uses 2 ** exponent as the
-        # scale, so an exponent of 1 doubles the size and an exponent of -1 halves it. The resulting
-        # distribution has a different mean and variance than a uniform one.
-        if self.scaling_type == "exponential":
-            scale = 2 ** torch.empty(1, dtype=torch.float32).uniform_(self.min_scale, self.max_scale).item()
-        # Linear scaling draws the scale itself uniformly in (min_scale, max_scale).
-        elif self.scaling_type == "linear":
-            scale = torch.empty(1, dtype=torch.float32).uniform_(self.min_scale, self.max_scale).item()
-        else:
-            # Previously this fell through and failed later with an unbound `scale`.
-            raise ValueError(f"Unknown scaling_type: {self.scaling_type!r}")
-
-        # never scale below the size needed to take a full crop
-        scale = max(scale, min_scale)
-
-        new_h, new_w = round(h * scale), round(w * scale)
-
-        if torch.rand(1).item() < self.rescale_prob:
-            # rescale the images
-            img_left = F.resize(img_left, size=(new_h, new_w), interpolation=INTERP_MODE)
-            img_right = F.resize(img_right, size=(new_h, new_w), interpolation=INTERP_MODE)
-
-            resized_masks, resized_disparities = (), ()
-
-            for disparity, mask in zip(disparities, masks):
-                if disparity is None:
-                    # Nothing to rescale. Emit explicit None entries instead of leaving the loop
-                    # variables unbound (first entry) or stale from the previous entry (second entry).
-                    # A mask without a disparity map has nothing to validate, so it is dropped too.
-                    resized_disparity, resized_mask = None, None
-                elif mask is None:
-                    # dense disparity: interpolate, then scale the values with the image size
-                    resized_disparity = F.resize(disparity, size=(new_h, new_w), interpolation=INTERP_MODE)
-                    resized_disparity = (
-                        resized_disparity * torch.tensor([scale], device=resized_disparity.device)[:, None, None]
-                    )
-                    resized_mask = None
-                else:
-                    # sparse disparity: move only the valid pixels so no invalid values get blended in
-                    resized_disparity, resized_mask = _resize_sparse_flow(
-                        disparity, mask, scale_x=scale, scale_y=scale
-                    )
-                resized_masks += (resized_mask,)
-                resized_disparities += (resized_disparity,)
-
-            disparities = resized_disparities
-            masks = resized_masks
-
-        # Note: For sparse datasets (Kitti), the original code uses a "margin"
-        # See e.g. https://github.com/princeton-vl/RAFT/blob/master/core/utils/augmentor.py#L220:L220
-        # We don't, not sure it matters much
-        y0 = torch.randint(0, img_left.shape[1] - self.crop_size[0], size=(1,)).item()
-        x0 = torch.randint(0, img_right.shape[2] - self.crop_size[1], size=(1,)).item()
-
-        # the same window is applied to both views, their disparities and their masks
-        img_left = F.crop(img_left, y0, x0, self.crop_size[0], self.crop_size[1])
-        img_right = F.crop(img_right, y0, x0, self.crop_size[0], self.crop_size[1])
-        if dsp_left is not None:
-            dsp_left = F.crop(disparities[0], y0, x0, self.crop_size[0], self.crop_size[1])
-        if dsp_right is not None:
-            dsp_right = F.crop(disparities[1], y0, x0, self.crop_size[0], self.crop_size[1])
-
-        cropped_masks = ()
-        for mask in masks:
-            if mask is not None:
-                mask = F.crop(mask, y0, x0, self.crop_size[0], self.crop_size[1])
-            cropped_masks += (mask,)
-
-        return ((img_left, img_right), (dsp_left, dsp_right), cropped_masks)
-
-
-def _resize_sparse_flow(
-    flow: Tensor, valid_flow_mask: Tensor, scale_x: float = 1.0, scale_y: float = 1.0
-) -> Tuple[Tensor, Tensor]:
-    """Resize a sparse flow/disparity map by relocating its valid pixels instead of interpolating.
-
-    There are as many non-zero values in the resized flow as in the original (up to pixels that land
-    out of bounds), so for example with ``scale_x = scale_y = 2`` the output is 4x sparser.
-
-    Args:
-        flow: tensor whose last two dimensions are ``(h, w)``; indexed as ``flow[:, i, j]``.
-        valid_flow_mask: mask of shape ``(h, w)`` selecting the valid pixels of ``flow``.
-        scale_x: horizontal scale factor; the flow values are multiplied by it as well.
-        scale_y: vertical scale factor. Defaults to 1.0 (the previous default of 0.0 produced a
-            zero-height output whenever the argument was omitted).
-
-    Returns:
-        ``(flow_new, valid_new)`` where ``flow_new`` has shape ``(1, h_new, w_new)`` and ``valid_new`` is a
-        boolean mask of shape ``(h_new, w_new)``.
-    """
-
-    h, w = flow.shape[-2:]
-    # allocate everything next to the input so the function also works for non-CPU tensors
-    device = flow.device
-
-    h_new = int(round(h * scale_y))
-    w_new = int(round(w * scale_x))
-    flow_new = torch.zeros(size=[1, h_new, w_new], dtype=flow.dtype, device=device)
-    valid_new = torch.zeros(size=[h_new, w_new], dtype=valid_flow_mask.dtype, device=device)
-
-    # jj holds column (x) indices and ii row (y) indices, both of shape (h, w)
-    jj, ii = torch.meshgrid(torch.arange(w, device=device), torch.arange(h, device=device), indexing="xy")
-
-    ii_valid, jj_valid = ii[valid_flow_mask], jj[valid_flow_mask]
-
-    # destination coordinates of every valid pixel
-    ii_valid_new = torch.round(ii_valid.to(float) * scale_y).to(torch.long)
-    jj_valid_new = torch.round(jj_valid.to(float) * scale_x).to(torch.long)
-
-    # rounding can push pixels on the far border just outside the new canvas: drop those
-    within_bounds_mask = (0 <= ii_valid_new) & (ii_valid_new < h_new) & (0 <= jj_valid_new) & (jj_valid_new < w_new)
-
-    ii_valid = ii_valid[within_bounds_mask]
-    jj_valid = jj_valid[within_bounds_mask]
-    ii_valid_new = ii_valid_new[within_bounds_mask]
-    jj_valid_new = jj_valid_new[within_bounds_mask]
-
-    # advanced indexing returns a copy, so the in-place scaling does not touch `flow`
-    valid_flow_new = flow[:, ii_valid, jj_valid]
-    valid_flow_new *= scale_x
-
-    flow_new[:, ii_valid_new, jj_valid_new] = valid_flow_new
-    valid_new[ii_valid_new, jj_valid_new] = valid_flow_mask[ii_valid, jj_valid]
-
-    return flow_new, valid_new.bool()
-
-
-class Compose(torch.nn.Module):
-    """Chain transforms that each map ``(images, disparities, masks)`` to a triple of the same form."""
-
-    def __init__(self, transforms: List[Callable]):
-        super().__init__()
-        self.transforms = transforms
-
-    @torch.inference_mode()
-    def forward(self, images, disparities, masks):
-        """Apply every transform in order, feeding each one the previous output."""
-        for transform in self.transforms:
-            images, disparities, masks = transform(images, disparities, masks)
-        return (images, disparities, masks)
--- a/references/depth/stereo/utils/__init__.py
+++ b/references/depth/stereo/utils/__init__.py
-from .losses import *
-from .metrics import *
-from .distributed import *
-from .logger import *
-from .padder import *
-from .norm import *
--- a/references/depth/stereo/utils/distributed.py
+++ b/references/depth/stereo/utils/distributed.py
-import os
-
-import torch
-import torch.distributed as dist
-
-
-def _redefine_print(is_main):
-    """Replace the builtin ``print`` so that it is silent unless ``is_main`` is true or ``force=True`` is passed."""
-    import builtins
-
-    original_print = builtins.print
-
-    def print(*args, **kwargs):
-        # `force` is consumed here and never forwarded to the real print
-        force = kwargs.pop("force", False)
-        if not (is_main or force):
-            return
-        original_print(*args, **kwargs)
-
-    builtins.print = print
-
-
-def setup_ddp(args):
-    """Populate the distributed fields of ``args`` and initialise the NCCL process group.
-
-    Sets ``args.rank``, ``args.world_size`` and ``args.gpu`` from the environment when available, sets
-    ``args.distributed`` accordingly, and returns early (without touching CUDA or the process group)
-    when no distributed launcher is detected. Reads ``args.dist_url`` as the init method.
-    """
-    # Set the local_rank, rank, and world_size values as args fields
-    # This is done differently depending on how we're running the script. We
-    # currently support either torchrun or the custom run_with_submitit.py
-    # If you're confused (like I was), this might help a bit
-    # https://discuss.pytorch.org/t/what-is-the-difference-between-rank-and-local-rank/61940/2
-
-    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
-        args.rank = int(os.environ["RANK"])
-        args.world_size = int(os.environ["WORLD_SIZE"])
-        args.gpu = int(os.environ["LOCAL_RANK"])
-    elif "SLURM_PROCID" in os.environ:
-        # NOTE(review): world_size is not set in this branch; presumably it is already on `args` - confirm
-        args.rank = int(os.environ["SLURM_PROCID"])
-        args.gpu = args.rank % torch.cuda.device_count()
-    elif hasattr(args, "rank"):
-        # rank was provided by the caller; the remaining fields are read from `args` as they are
-        pass
-    else:
-        print("Not using distributed mode")
-        args.distributed = False
-        args.world_size = 1
-        return
-
-    args.distributed = True
-
-    torch.cuda.set_device(args.gpu)
-    dist.init_process_group(
-        backend="nccl",
-        rank=args.rank,
-        world_size=args.world_size,
-        init_method=args.dist_url,
-    )
-    torch.distributed.barrier()
-    # from here on, only rank 0 prints unless force=True is passed to print
-    _redefine_print(is_main=(args.rank == 0))
-
-
-def reduce_across_processes(val):
-    """Move ``val`` to a CUDA tensor, all-reduce it across the process group and return the tensor."""
-    reduced = torch.tensor(val, device="cuda")
-    # wait for every process before reducing
-    dist.barrier()
-    dist.all_reduce(reduced)
-    return reduced
--- a/references/depth/stereo/utils/logger.py
+++ b/references/depth/stereo/utils/logger.py
-import datetime
-import time
-from collections import defaultdict, deque
-
-import torch
-
-from .distributed import reduce_across_processes
-
-
-class SmoothedValue:
-    """Keep the last ``window_size`` values of a series together with its running count and total.
-
-    Windowed statistics (``median``, ``avg``, ``max``, ``value``) come from the bounded deque, while
-    ``global_avg`` is computed over everything ever passed to ``update``.
-    """
-
-    def __init__(self, window_size=20, fmt="{median:.4f} ({global_avg:.4f})"):
-        self.fmt = fmt
-        self.count = 0
-        self.total = 0.0
-        self.deque = deque(maxlen=window_size)
-
-    def update(self, value, n=1):
-        """Append ``value`` to the window and count it ``n`` times in the global statistics."""
-        self.deque.append(value)
-        self.total += value * n
-        self.count += n
-
-    def synchronize_between_processes(self):
-        """
-        Reduce ``count`` and ``total`` across processes.
-
-        Warning: does not synchronize the deque!
-        """
-        count, total = reduce_across_processes([self.count, self.total]).tolist()
-        self.count = int(count)
-        self.total = total
-
-    @property
-    def median(self):
-        window = torch.tensor(list(self.deque))
-        return window.median().item()
-
-    @property
-    def avg(self):
-        window = torch.tensor(list(self.deque), dtype=torch.float32)
-        return window.mean().item()
-
-    @property
-    def global_avg(self):
-        return self.total / self.count
-
-    @property
-    def max(self):
-        return max(self.deque)
-
-    @property
-    def value(self):
-        # most recent value
-        return self.deque[-1]
-
-    def __str__(self):
-        stats = {
-            "median": self.median,
-            "avg": self.avg,
-            "global_avg": self.global_avg,
-            "max": self.max,
-            "value": self.value,
-        }
-        return self.fmt.format(**stats)
-
-
-class MetricLogger:
-    """A set of named ``SmoothedValue`` meters plus a helper that prints progress while iterating."""
-
-    def __init__(self, delimiter="\t"):
-        self.meters = defaultdict(SmoothedValue)
-        self.delimiter = delimiter
-
-    def update(self, **kwargs):
-        """Feed one scalar per keyword into the meter of the same name; tensors are unwrapped with ``.item()``."""
-        for name, value in kwargs.items():
-            if isinstance(value, torch.Tensor):
-                value = value.item()
-            if isinstance(value, (float, int)):
-                self.meters[name].update(value)
-                continue
-            raise TypeError(
-                f"This method expects the value of the input arguments to be of type float or int, instead  got {type(value)}"
-            )
-
-    def __getattr__(self, attr):
-        # Only called when regular lookup fails: expose meters as attributes, then fall back to the
-        # instance dict.
-        for namespace in (self.meters, self.__dict__):
-            if attr in namespace:
-                return namespace[attr]
-        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{attr}'")
-
-    def __str__(self):
-        return self.delimiter.join(f"{name}: {str(meter)}" for name, meter in self.meters.items())
-
-    def synchronize_between_processes(self):
-        for meter in self.meters.values():
-            meter.synchronize_between_processes()
-
-    def add_meter(self, name, **kwargs):
-        """Register (or replace) the meter ``name`` with a ``SmoothedValue`` built from ``kwargs``."""
-        self.meters[name] = SmoothedValue(**kwargs)
-
-    def log_every(self, iterable, print_freq=5, header=None):
-        """Yield the items of ``iterable`` and print a progress line every ``print_freq`` iterations.
-
-        ``iterable`` must support ``len()``. Each line shows the index, an ETA, all meters, the average
-        iteration and data-loading times and, when CUDA is available, the peak allocated memory in MB.
-        The total elapsed time is printed once the iterable is exhausted.
-        """
-        header = header or ""
-        start_time = time.time()
-        end = time.time()
-        iter_time = SmoothedValue(fmt="{avg:.4f}")
-        data_time = SmoothedValue(fmt="{avg:.4f}")
-
-        # pad the running index to the width of the total count
-        index_width = str(len(str(len(iterable))))
-        fields = [
-            header,
-            "[{0:" + index_width + "d}/{1}]",
-            "eta: {eta}",
-            "{meters}",
-            "time: {time}",
-            "data: {data}",
-        ]
-        if torch.cuda.is_available():
-            fields.append("max mem: {memory:.0f}")
-        log_msg = self.delimiter.join(fields)
-
-        MB = 1024.0 * 1024.0
-        for i, obj in enumerate(iterable):
-            # time spent waiting for the item vs. time for the whole iteration
-            data_time.update(time.time() - end)
-            yield obj
-            iter_time.update(time.time() - end)
-            if print_freq is not None and i % print_freq == 0:
-                eta_seconds = iter_time.global_avg * (len(iterable) - i)
-                report = {
-                    "eta": str(datetime.timedelta(seconds=int(eta_seconds))),
-                    "meters": str(self),
-                    "time": str(iter_time),
-                    "data": str(data_time),
-                }
-                if torch.cuda.is_available():
-                    report["memory"] = torch.cuda.max_memory_allocated() / MB
-                print(log_msg.format(i, len(iterable), **report))
-            end = time.time()
-
-        total_time = time.time() - start_time
-        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
-        print(f"{header} Total time: {total_time_str}")
--- a/references/depth/stereo/utils/losses.py
+++ b/references/depth/stereo/utils/losses.py
-from typing import List, Optional
-
-import torch
-from torch import nn, Tensor
-from torch.nn import functional as F
-from torchvision.prototype.models.depth.stereo.raft_stereo import grid_sample, make_coords_grid
-
-
-def make_gaussian_kernel(kernel_size: int, sigma: float) -> torch.Tensor:
-    """Create a normalised 2D Gaussian kernel.
-
-    Args:
-        kernel_size: side length of the square kernel.
-        sigma: standard deviation of the Gaussian.
-
-    Returns:
-        Tensor of shape ``(kernel_size, kernel_size)`` whose entries sum to 1.
-    """
-
-    # coordinates centred on the middle of the kernel
-    x = torch.arange(kernel_size, dtype=torch.float32)
-    y = torch.arange(kernel_size, dtype=torch.float32)
-    x = x - (kernel_size - 1) / 2
-    y = y - (kernel_size - 1) / 2
-    # "ij" is what meshgrid did implicitly; stating it avoids the deprecation warning
-    x, y = torch.meshgrid(x, y, indexing="ij")
-    grid = (x**2 + y**2) / (2 * sigma**2)
-    kernel = torch.exp(-grid)
-    kernel = kernel / kernel.sum()
-    return kernel
-
-
-def _sequence_loss_fn(
-    flow_preds: List[Tensor],
-    flow_gt: Tensor,
-    valid_flow_mask: Optional[Tensor],
-    gamma: Tensor,
-    max_flow: int = 256,
-    exclude_large: bool = False,
-    weights: Optional[Tensor] = None,
-):
-    """Weighted L1 loss over a sequence of flow predictions.
-
-    Each prediction's mean absolute error is weighted by ``gamma ** k``, where ``k`` is 0 for the last
-    prediction and grows towards the first. Returns ``(loss, weights)`` so the caller can cache
-    ``weights``; they are only rebuilt when missing or of the wrong length.
-    """
-    torch._assert(
-        gamma < 1,
-        "sequence_loss: `gamma` must be lower than 1, but got {}".format(gamma),
-    )
-
-    mask = valid_flow_mask
-    if exclude_large:
-        # drop pixels whose ground-truth displacement norm is not below max_flow
-        not_too_large = torch.sum(flow_gt**2, dim=1).sqrt() < max_flow
-        mask = not_too_large if mask is None else mask & not_too_large
-
-    if mask is not None:
-        # add the channel dimension
-        mask = mask.unsqueeze(1)
-
-    stacked = torch.stack(flow_preds)  # shape = (num_flow_updates, batch_size, 2, H, W)
-    num_predictions = stacked.shape[0]
-
-    errors = (stacked - flow_gt).abs()
-    if mask is not None:
-        errors = errors * mask.unsqueeze(0)
-    per_prediction = errors.mean(axis=(1, 2, 3, 4))
-
-    # Allocating on CPU and moving to the device at run time can force an unwanted GPU
-    # synchronization with a large overhead, so the weights are built directly on the device.
-    if weights is None or len(weights) != num_predictions:
-        weights = gamma ** torch.arange(num_predictions - 1, -1, -1, device=stacked.device, dtype=stacked.dtype)
-
-    return (per_prediction * weights).sum(), weights
-
-
-class SequenceLoss(nn.Module):
-    """Module wrapper around ``_sequence_loss_fn`` that caches the per-prediction weights between calls."""
-
-    def __init__(self, gamma: float = 0.8, max_flow: int = 256, exclude_large_flows: bool = False) -> None:
-        """
-        Args:
-            gamma: value for the exponential weighting of the loss across frames
-            max_flow: maximum flow value to exclude
-            exclude_large_flows: whether to exclude large flows
-        """
-
-        super().__init__()
-        self.register_buffer("gamma", torch.tensor([gamma]))
-        self.max_flow = max_flow
-        self.excluding_large = exclude_large_flows
-        # weights returned by the previous call, reused while the number of predictions is unchanged
-        self._weights = None
-
-    def forward(self, flow_preds: List[Tensor], flow_gt: Tensor, valid_flow_mask: Optional[Tensor]) -> Tensor:
-        """
-        Args:
-            flow_preds: list of flow predictions of shape (batch_size, C, H, W)
-            flow_gt: ground truth flow of shape (batch_size, C, H, W)
-            valid_flow_mask: mask of valid flow pixels of shape (batch_size, H, W)
-        """
-        loss, self._weights = _sequence_loss_fn(
-            flow_preds,
-            flow_gt,
-            valid_flow_mask,
-            gamma=self.gamma,
-            max_flow=self.max_flow,
-            exclude_large=self.excluding_large,
-            weights=self._weights,
-        )
-        return loss
-
-    def set_gamma(self, gamma: float) -> None:
-        """Overwrite the ``gamma`` buffer in place and invalidate the cached weights."""
-        self.gamma.fill_(gamma)
-        self._weights = None
-
-
-def _ssim_loss_fn(
-    source: Tensor,
-    reference: Tensor,
-    kernel: Tensor,
-    eps: float = 1e-8,
-    c1: float = 0.01**2,
-    c2: float = 0.03**2,
-    use_padding: bool = False,
-) -> Tensor:
-    """Return ``1 - SSIM`` for every sample of a batch, using ``kernel`` as the local averaging window.
-
-    ``source`` and ``reference`` must be 4-dimensional tensors of identical shape ``(B, C, H, W)``;
-    the result has shape ``(B,)``. With ``use_padding`` the inputs are reflect-padded by half the
-    kernel size before filtering.
-    """
-    # ref: Algorithm section: https://en.wikipedia.org/wiki/Structural_similarity
-    # ref: Alternative implementation: https://kornia.readthedocs.io/en/latest/_modules/kornia/metrics/ssim.html#ssim
-
-    torch._assert(
-        source.ndim == reference.ndim == 4,
-        "SSIM: `source` and `reference` must be 4-dimensional tensors",
-    )
-
-    torch._assert(
-        source.shape == reference.shape,
-        "SSIM: `source` and `reference` must have the same shape, but got {} and {}".format(
-            source.shape, reference.shape
-        ),
-    )
-
-    B, C, H, W = source.shape
-    # one copy of the kernel per channel, applied as a depthwise convolution
-    kernel = kernel.unsqueeze(0).unsqueeze(0).repeat(C, 1, 1, 1)
-
-    if use_padding:
-        pad_size = kernel.shape[2] // 2
-        padding = (pad_size, pad_size, pad_size, pad_size)
-        source = F.pad(source, padding, "reflect")
-        reference = F.pad(reference, padding, "reflect")
-
-    def _local_mean(t: Tensor) -> Tensor:
-        return F.conv2d(t, kernel, groups=C)
-
-    mu1 = _local_mean(source)
-    mu2 = _local_mean(reference)
-    mu1_sq = mu1.pow(2)
-    mu2_sq = mu2.pow(2)
-    mu1_mu2 = mu1 * mu2
-
-    # local (co)variances: E[xy] - E[x]E[y]
-    sigma1_sq = _local_mean(source.pow(2)) - mu1_sq
-    sigma2_sq = _local_mean(reference.pow(2)) - mu2_sq
-    sigma12 = _local_mean(source * reference) - mu1_mu2
-
-    numerator = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2)
-    denominator = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)
-    ssim = numerator / (denominator + eps)
-
-    # SSIM is a similarity to maximise, so the loss is its complement
-    return 1 - ssim.mean(dim=(1, 2, 3))
-
-
-class SSIM(nn.Module):
-    """Module form of the SSIM loss with a fixed Gaussian window stored as a buffer."""
-
-    def __init__(
-        self,
-        kernel_size: int = 11,
-        max_val: float = 1.0,
-        sigma: float = 1.5,
-        eps: float = 1e-12,
-        use_padding: bool = True,
-    ) -> None:
-        """SSIM loss function.
-
-        Args:
-            kernel_size: size of the Gaussian kernel
-            max_val: constant scaling factor
-            sigma: sigma of the Gaussian kernel
-            eps: constant for division by zero
-            use_padding: whether to pad the input tensor such that we have a score for each pixel
-        """
-        super().__init__()
-
-        self.kernel_size = kernel_size
-        self.max_val = max_val
-        self.sigma = sigma
-        self.eps = eps
-        self.use_padding = use_padding
-
-        # stabilising constants of the SSIM formula, scaled by max_val
-        self.c1 = (0.01 * self.max_val) ** 2
-        self.c2 = (0.03 * self.max_val) ** 2
-
-        self.register_buffer("gaussian_kernel", make_gaussian_kernel(kernel_size, sigma))
-
-    def forward(self, source: torch.Tensor, reference: torch.Tensor) -> torch.Tensor:
-        """
-        Args:
-            source: source image of shape (batch_size, C, H, W)
-            reference: reference image of shape (batch_size, C, H, W)
-
-        Returns:
-            SSIM loss of shape (batch_size,)
-        """
-        options = dict(
-            kernel=self.gaussian_kernel,
-            c1=self.c1,
-            c2=self.c2,
-            use_padding=self.use_padding,
-            eps=self.eps,
-        )
-        return _ssim_loss_fn(source, reference, **options)
-
-
-def _smoothness_loss_fn(img_gx: Tensor, img_gy: Tensor, val_gx: Tensor, val_gy: Tensor):
-    """Edge-aware smoothness term computed from precomputed x/y gradients.
-
-    The image gradients are multiplied by ``exp(-mean_c |val gradient|)`` and the sum of their absolute
-    values is averaged over the last three dimensions.
-    """
-    # ref: https://github.com/nianticlabs/monodepth2/blob/b676244e5a1ca55564eb5d16ab521a48f823af31/layers.py#L202
-
-    torch._assert(
-        img_gx.ndim >= 3,
-        "smoothness_loss: `img_gx` must be at least 3-dimensional tensor of shape (..., C, H, W)",
-    )
-
-    torch._assert(
-        img_gx.ndim == val_gx.ndim,
-        "smoothness_loss: `img_gx` and `depth_gx` must have the same dimensionality, but got {} and {}".format(
-            img_gx.ndim, val_gx.ndim
-        ),
-    )
-
-    # every dimension must either match or be 1 on one side so the product below broadcasts
-    for img_dim, val_dim in zip(img_gx.shape, val_gx.shape):
-        torch._assert(
-            (img_dim == val_dim or (img_dim == 1 or val_dim == 1)),
-            "smoothness_loss: `img_gx` and `depth_gx` must have either the same shape or broadcastable shape, but got {} and {}".format(
-                img_gx.shape, val_gx.shape
-            ),
-        )
-
-    # -3 is channel dimension
-    weights_x = torch.exp(-torch.mean(torch.abs(val_gx), axis=-3, keepdim=True))
-    weights_y = torch.exp(-torch.mean(torch.abs(val_gy), axis=-3, keepdim=True))
-
-    weighted_x = torch.abs(img_gx * weights_x)
-    weighted_y = torch.abs(img_gy * weights_y)
-
-    return (weighted_x + weighted_y).mean(axis=(-3, -2, -1))
-
-
-class SmoothnessLoss(nn.Module):
-    """Smoothness loss computed from forward differences of ``images`` and ``vals``."""
-
-    def __init__(self) -> None:
-        super().__init__()
-
-    def _x_gradient(self, img: Tensor) -> Tensor:
-        """Forward difference along the last (width) dimension; the output keeps the input shape."""
-        original_shape = img.shape
-        # inputs with extra leading dimensions are flattened to 4D for padding and restored afterwards
-        flattened = img.ndim > 4
-        if flattened:
-            img = img.reshape(-1, *original_shape[-3:])
-
-        padded = F.pad(img, (0, 1, 0, 0), mode="replicate")
-        grad = padded[..., :, :-1] - padded[..., :, 1:]
-        return grad.reshape(original_shape) if flattened else grad
-
-    def _y_gradient(self, x: torch.Tensor) -> torch.Tensor:
-        """Forward difference along the second to last (height) dimension; the output keeps the input shape."""
-        original_shape = x.shape
-        flattened = x.ndim > 4
-        if flattened:
-            x = x.reshape(-1, *original_shape[-3:])
-
-        padded = F.pad(x, (0, 0, 0, 1), mode="replicate")
-        grad = padded[..., :-1, :] - padded[..., 1:, :]
-        return grad.reshape(original_shape) if flattened else grad
-
-    def forward(self, images: Tensor, vals: Tensor) -> Tensor:
-        """
-        Args:
-            images: tensor of shape (D1, D2, ..., DN, C, H, W)
-            vals: tensor of shape (D1, D2, ..., DN, 1, H, W)
-
-        Returns:
-            smoothness loss of shape (D1, D2, ..., DN)
-        """
-        image_gradients = (self._x_gradient(images), self._y_gradient(images))
-        value_gradients = (self._x_gradient(vals), self._y_gradient(vals))
-
-        return _smoothness_loss_fn(*image_gradients, *value_gradients)
-
-
-def _flow_sequence_consistency_loss_fn(
-    flow_preds: List[Tensor],
-    gamma: float = 0.8,
-    resize_factor: float = 0.25,
-    rescale_factor: float = 0.25,
-    rescale_mode: str = "bilinear",
-    weights: Optional[Tensor] = None,
-):
-    """Penalise squared differences between consecutive flow predictions of a sequence.
-
-    Returns ``(loss, weights)``; ``weights`` is rebuilt as ``gamma ** k`` only when it is missing or its
-    length does not match the number of consecutive pairs.
-    """
-
-    # Simplified version of ref: https://arxiv.org/pdf/2006.11242.pdf
-    # In the original paper, an additional refinement network is used to refine a flow prediction.
-    # Each step performed by the recurrent module in Raft or CREStereo is a refinement step using a
-    # delta_flow update, which should be consistent with the previous step. This implementation
-    # simplifies the overall loss term and ignores the left-right consistency loss and the
-    # photometric loss, which can be treated separately.
-
-    torch._assert(
-        rescale_factor <= 1.0,
-        "sequence_consistency_loss: `rescale_factor` must be less than or equal to 1, but got {}".format(
-            rescale_factor
-        ),
-    )
-
-    sequence = torch.stack(flow_preds)  # shape = (num_flow_updates, batch_size, 2, H, W)
-    N, B, C, H, W = sequence.shape
-
-    # rescale flow predictions to account for bilinear upsampling artifacts
-    if rescale_factor:
-        resized = F.interpolate(
-            sequence.view(N * B, C, H, W), scale_factor=resize_factor, mode=rescale_mode, align_corners=True
-        )
-        resized = resized * rescale_factor
-        # split the merged (N * B) dimension back into N predictions
-        sequence = torch.stack(torch.chunk(resized, N, dim=0), dim=0)
-
-    # force the next prediction to be similar to the previous prediction
-    step_error = (sequence[1:] - sequence[:-1]).square().mean(axis=(1, 2, 3, 4))
-
-    num_pairs = sequence.shape[0] - 1  # because we are comparing differences
-    if weights is None or len(weights) != num_pairs:
-        weights = gamma ** torch.arange(num_pairs - 1, -1, -1, device=sequence.device, dtype=sequence.dtype)
-
-    return (step_error * weights).sum(), weights
-
-
-class FlowSequenceConsistencyLoss(nn.Module):
-    """Module wrapper around ``_flow_sequence_consistency_loss_fn`` that caches the weights between calls."""
-
-    def __init__(
-        self,
-        gamma: float = 0.8,
-        resize_factor: float = 0.25,
-        rescale_factor: float = 0.25,
-        rescale_mode: str = "bilinear",
-    ) -> None:
-        """
-        Args:
-            gamma: base of the exponential weighting applied across consecutive prediction pairs
-            resize_factor: spatial scale factor passed to the interpolation of the predictions
-            rescale_factor: factor the interpolated flow values are multiplied by; a falsy value
-                disables the interpolation step altogether
-            rescale_mode: interpolation mode
-        """
-        super().__init__()
-        # plain Python float (not a buffer), see set_gamma
-        self.gamma = gamma
-        self.resize_factor = resize_factor
-        self.rescale_factor = rescale_factor
-        self.rescale_mode = rescale_mode
-        # weights returned by the previous call, reused while the number of predictions is unchanged
-        self._weights = None
-
-    def forward(self, flow_preds: List[Tensor]) -> Tensor:
-        """
-        Args:
-            flow_preds: list of tensors of shape (batch_size, C, H, W)
-
-        Returns:
-            sequence consistency loss as a scalar tensor (summed over the weighted prediction pairs)
-        """
-        loss, weights = _flow_sequence_consistency_loss_fn(
-            flow_preds,
-            gamma=self.gamma,
-            resize_factor=self.resize_factor,
-            rescale_factor=self.rescale_factor,
-            rescale_mode=self.rescale_mode,
-            weights=self._weights,
-        )
-        self._weights = weights
-        return loss
-
-    def set_gamma(self, gamma: float) -> None:
-        """Replace ``gamma`` and invalidate the cached weights."""
-        # gamma is stored as a float here, so it has no `fill_`; the previous in-place call raised
-        # AttributeError
-        self.gamma = gamma
-        # reset the cached scale factor
-        self._weights = None
-
-
-def _psnr_loss_fn(source: torch.Tensor, target: torch.Tensor, max_val: float) -> torch.Tensor:
-    """Peak signal-to-noise ratio in dB, reduced over the last three dimensions."""
-    torch._assert(
-        source.shape == target.shape,
-        "psnr_loss: source and target must have the same shape, but got {} and {}".format(source.shape, target.shape),
-    )
-
-    # ref https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio
-    mse = (source - target).pow(2).mean(axis=(-3, -2, -1))
-    return 10 * torch.log10(max_val**2 / mse)
-
-
-class PSNRLoss(nn.Module):
-    """Negated PSNR, so that minimising the loss maximises the PSNR."""
-
-    def __init__(self, max_val: float = 256) -> None:
-        """
-        Args:
-            max_val: maximum value of the input tensor. This refers to the maximum domain value of the input tensor.
-
-        """
-        super().__init__()
-        self.max_val = max_val
-
-    def forward(self, source: Tensor, target: Tensor) -> Tensor:
-        """
-        Args:
-            source: tensor of shape (D1, D2, ..., DN, C, H, W)
-            target: tensor of shape (D1, D2, ..., DN, C, H, W)
-
-        Returns:
-            psnr loss of shape (D1, D2, ..., DN)
-        """
-
-        psnr = _psnr_loss_fn(source, target, self.max_val)
-        # multiply by -1 as we want to maximize the psnr
-        return -1 * psnr
-
-
-class FlowPhotoMetricLoss(nn.Module):
-    def __init__(
-        self,
-        ssim_weight: float = 0.85,
-        ssim_window_size: int = 11,
-        ssim_max_val: float = 1.0,
-        ssim_sigma: float = 1.5,
-        ssim_eps: float = 1e-12,
-        ssim_use_padding: bool = True,
-        max_displacement_ratio: float = 0.15,
-    ) -> None:
-        super().__init__()
-
-        self._ssim_loss = SSIM(
-            kernel_size=ssim_window_size,
-            max_val=ssim_max_val,
-            sigma=ssim_sigma,
-            eps=ssim_eps,
-            use_padding=ssim_use_padding,
-        )
-
-        self._L1_weight = 1 - ssim_weight
-        self._SSIM_weight = ssim_weight
-        self._max_displacement_ratio = max_displacement_ratio
-
-    def forward(
-        self,
-        source: Tensor,
-        reference: Tensor,
-        flow_pred: Tensor,
-        valid_mask: Optional[Tensor] = None,
-    ):
-        """
-        Args:
-            source: tensor of shape (B, C, H, W)
-            reference: tensor of shape (B, C, H, W)
-            flow_pred: tensor of shape (B, 2, H, W)
-            valid_mask: tensor of shape (B, H, W) or None
-
-        Returns:
-            photometric loss of shape
-
-        """
-        torch._assert(
-            source.ndim == 4,
-            "FlowPhotoMetricLoss: source must have 4 dimensions, but got {}".format(source.ndim),
-        )
-        torch._assert(
-            reference.ndim == source.ndim,
-            "FlowPhotoMetricLoss: source and other must have the same number of dimensions, but got {} and {}".format(
-                source.ndim, reference.ndim
-            ),
-        )
-        torch._assert(
-            flow_pred.shape[1] == 2,
-            "FlowPhotoMetricLoss: flow_pred must have 2 channels, but got {}".format(flow_pred.shape[1]),
-        )
-        torch._assert(
-            flow_pred.ndim == 4,
-            "FlowPhotoMetricLoss: flow_pred must have 4 dimensions, but got {}".format(flow_pred.ndim),
-        )
-
-        B, C, H, W = source.shape
-        flow_channels = flow_pred.shape[1]
-
-        max_displacements = []
-        for dim in range(flow_channels):
-            shape_index = -1 - dim
-            max_displacements.append(int(self._max_displacement_ratio * source.shape[shape_index]))
-
-        # mask out all pixels that have larger flow than the max flow allowed
-        max_flow_mask = torch.logical_and(
-            *[flow_pred[:, dim, :, :] < max_displacements[dim] for dim in range(flow_channels)]
-        )
-
-        if valid_mask is not None:
-            valid_mask = torch.logical_and(valid_mask, max_flow_mask).unsqueeze(1)
-        else:
-            valid_mask = max_flow_mask.unsqueeze(1)
-
-        grid = make_coords_grid(B, H, W, device=str(source.device))
-        resampled_grids = grid - flow_pred
-        resampled_grids = resampled_grids.permute(0, 2, 3, 1)
-        resampled_source = grid_sample(reference, resampled_grids, mode="bilinear")
-
-        # compute SSIM loss
-        ssim_loss = self._ssim_loss(resampled_source * valid_mask, source * valid_mask)
-        l1_loss = (resampled_source * valid_mask - source * valid_mask).abs().mean(axis=(-3, -2, -1))
-        loss = self._L1_weight * l1_loss + self._SSIM_weight * ssim_loss
-
-        return loss.mean()