We employ a multi-set fine-tuning stage where we uniformly sample from multiple datasets. Given that some of these datasets contain extremely large images (``2048x2048`` or more), we opt for a very aggressive scale range of ``[0.2 - 0.8]`` so that as much of the original frame composition as possible is captured inside the ``384x512`` crop.
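As a rough illustration (the function names and the resize-then-crop order below are assumptions, not the actual transform pipeline), drawing a scale from ``[0.2 - 0.8]`` shrinks the frame before the fixed-size crop, so a single ``384x512`` window covers a much larger share of the original composition:

```python
import random

# Illustrative sketch only: the names and the resize-then-crop order are
# assumptions, not the actual training transform pipeline.
def sample_scale(scale_range=(0.2, 0.8)):
    """Draw a resize scale uniformly from the configured range."""
    return random.uniform(*scale_range)

def resized_dims(orig_hw, scale):
    """Frame dimensions after scaling, before the fixed 384x512 crop."""
    h, w = orig_hw
    return int(h * scale), int(w * scale)

# At scale 0.4 a 2048x2048 frame shrinks to 819x819, so the 384x512 crop
# covers a far larger portion of the scene than it would at full resolution.
dims = resized_dims((2048, 2048), 0.4)
```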
This should give an **mae of about 1.416** on the train set of `Middlebury2014`. Results may vary slightly depending on the batch size and the number of GPUs. For the most accurate results, use 1 GPU and `--batch-size 1`. The created log file should look like this, where the first key is the number of cascades and the nested key is the number of recursive iterations:
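As an illustration only, the nested structure can be sketched as follows; the cascade and iteration key values here are placeholders, and only the ~1.416 mae figure comes from the run described above:

```python
# Illustrative shape of the evaluation log: the outer key is the number of
# cascades, the nested key the number of recursive iterations. The key values
# are placeholders; only the ~1.416 mae figure comes from the text above.
log = {
    "2": {              # number of cascades (placeholder)
        "10": {         # number of recursive iterations (placeholder)
            "mae": 1.416,
        },
    },
}
```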
We encourage users to be aware of the **aspect-ratio** and **disparity scale** they are targeting when doing any sort of training or fine-tuning. The model is highly sensitive to these two factors; as a consequence, naive multi-set fine-tuning can achieve `0.2 mae` relatively fast. We recommend that users pay close attention to how they **balance dataset sizing** when training such networks.
Ideally, dataset scaling should be treated at an individual level, and a thorough **EDA** of the disparity distribution in random crops at the desired training / inference size should be performed prior to any large compute investments.
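A minimal sketch of such an EDA, assuming a full-resolution ground-truth disparity map is available as a `numpy` array (all names here are illustrative):

```python
import numpy as np

# Hypothetical EDA sketch: summarize the ground-truth disparity observed inside
# random crops at the intended training size. `disparity` stands in for a
# full-resolution ground-truth disparity map; the names are illustrative.
def crop_disparity_stats(disparity, crop_hw=(384, 512), n_crops=100, seed=0):
    rng = np.random.default_rng(seed)
    ch, cw = crop_hw
    h, w = disparity.shape
    samples = []
    for _ in range(n_crops):
        top = rng.integers(0, h - ch + 1)
        left = rng.integers(0, w - cw + 1)
        samples.append(disparity[top:top + ch, left:left + cw].ravel())
    values = np.concatenate(samples)
    return {"mean": float(values.mean()), "p95": float(np.percentile(values, 95))}

# Example on a synthetic 2048x2048 ramp where disparity grows linearly with x;
# comparing these stats across datasets reveals mismatched disparity scales.
fake = np.tile(np.linspace(0, 256, 2048, dtype=np.float32), (2048, 1))
stats = crop_disparity_stats(fake)
```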
### Disparity scaling
##### Sample A
The top row contains a sample from `Sintel` whereas the bottom row one from `Middlebury`.

From left to right (`left_image`, `right_image`, `valid_mask`, `valid_mask & ground_truth`, `prediction`). **Darker is further away, lighter is closer**. In the case of `Sintel`, which is more closely aligned to the original distribution of `CREStereo`, we notice that the model accurately predicts the background scale, whereas in the case of `Middlebury2014` it cannot correctly estimate the continuous disparity. Notice that the frame composition is similar for both examples. The blue skybox in the `Sintel` scene behaves similarly to the `Middlebury` black background. However, because the `Middlebury` sample comes from an extremely large scene, the crop size of `384x512` does not correctly capture the general training distribution.
##### Sample B
The top row contains a scene from `Sceneflow` using the `Monkaa` split whilst the bottom row is a scene from `Middlebury`. This sample exhibits the same issues when it comes to **background estimation**. Given the exaggerated size of the `Middlebury` samples, the model **collapses the smooth background** of the sample to what it considers to be a mean background disparity value.
For more detail on why this behaviour occurs based on the training distribution proportions you can read more about the network at: https://github.com/pytorch/vision/pull/6629#discussion_r978160493
### Metric overfitting
##### Learning is critical in the beginning
We also advise users to make use of faster training schedules, as the performance gain over long periods of time is marginal. Here we exhibit the difference between a faster decay schedule and a later decay schedule.

In **grey** we set the lr decay to begin after `30000` steps whilst in **orange** we opt for a very late learning rate decay at around `180000` steps. Although it exhibits stronger variance, we can notice that unfreezing the learning rate earlier whilst employing `gradient-norm` outperforms the default configuration.
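The two schedules can be sketched as follows, assuming a flat hold followed by a linear decay to zero (the base lr value and the linear shape are assumptions; the decay-start steps come from the runs above):

```python
# Sketch of the compared schedules: hold the base lr flat, then decay linearly
# to zero. base_lr=4e-4 and the linear shape are assumptions; the 30_000 /
# 180_000 decay-start steps come from the grey / orange runs described above.
def lr_at(step, base_lr=4e-4, decay_start=30_000, total_steps=300_000):
    if step < decay_start:
        return base_lr
    frac = (step - decay_start) / (total_steps - decay_start)
    return base_lr * (1.0 - frac)

early = lr_at(150_000, decay_start=30_000)    # grey: already well into decay
late = lr_at(150_000, decay_start=180_000)    # orange: still at the base lr
```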
##### Gradient norm saves time

In **grey** we keep ``gradient norm`` enabled whilst in **orange** we do not. We can notice that removing the gradient norm exacerbates the performance decrease in the early stages whilst also showcasing an almost complete collapse around the `60000`-step mark where we started decaying the lr for **orange**.
Although both runs achieve an improvement of about ``0.1`` mae after the lr decay starts, the benefits are observable much faster when ``gradient norm`` is employed, as the recovery period is no longer accounted for.
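In PyTorch this corresponds to calling `torch.nn.utils.clip_grad_norm_` on the model parameters before each optimizer step; the pure-Python sketch below only shows the mechanics, and the `max_norm=1.0` threshold is an assumption:

```python
import math

# Sketch of gradient-norm clipping in pure Python. In practice this is
# torch.nn.utils.clip_grad_norm_ applied to the model parameters before each
# optimizer step; max_norm=1.0 is an illustrative threshold.
def clip_grad_norm(grads, max_norm=1.0):
    total = math.sqrt(sum(g * g for g in grads))
    if total > max_norm:
        scale = max_norm / total
        grads = [g * scale for g in grads]
    return grads, total

clipped, norm = clip_grad_norm([3.0, 4.0])  # norm 5.0 -> rescaled to norm 1.0
```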
```python
parser.add_argument("--name", default="crestereo", help="name of the experiment")
parser.add_argument("--resume", type=str, default=None, help="from which checkpoint to resume")
parser.add_argument("--checkpoint-dir", type=str, default="checkpoints", help="path to the checkpoint directory")
# dataset
parser.add_argument("--dataset-root", type=str, default="", help="path to the dataset root directory")
parser.add_argument(
    "--train-datasets",
    type=str,
    nargs="+",
    default=["crestereo"],
    help="dataset(s) to train on",
    choices=list(VALID_DATASETS.keys()),
)
parser.add_argument(
    "--dataset-steps", type=int, nargs="+", default=[300_000], help="number of steps for each dataset"
)
parser.add_argument(
    "--steps-is-epochs", action="store_true", help="if set, dataset-steps are interpreted as epochs"
)
parser.add_argument(
    "--test-datasets",
    type=str,
    nargs="+",
    default=["middlebury2014-train"],
    help="dataset(s) to test on",
    choices=["middlebury2014-train"],
)
parser.add_argument("--dataset-shuffle", type=bool, help="shuffle the dataset", default=True)
parser.add_argument("--dataset-order-shuffle", type=bool, help="shuffle the dataset order", default=True)
parser.add_argument("--batch-size", type=int, default=2, help="batch size per GPU")
parser.add_argument("--workers", type=int, default=4, help="number of workers per GPU")
parser.add_argument(
    "--threads",
    type=int,
    default=16,
    help="number of CPU threads per GPU. This can be changed around to speed-up transforms if needed. This can lead to worker thread contention so use with care.",
)
# model architecture
parser.add_argument(
    "--model",
    type=str,
    default="crestereo_base",
    help="model architecture",
    choices=["crestereo_base", "raft_stereo"],
)
parser.add_argument("--recurrent-updates", type=int, default=10, help="number of recurrent updates")
```